# Sentiment analysis of posts from different social media sites

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("sent_analysis.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,i seriously hate one subject to death but now ...,neutral
1,1,im so full of life i feel appalled,negative
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,positive
4,4,i feel suspicious if there is no one outside l...,neutral


__*Though some values look incorrect on first glance, the full sentences are in accordance with the sentiment label*__

In [3]:
from sklearn.preprocessing import LabelEncoder

df["Sentiment"] = LabelEncoder().fit_transform(df["Sentiment"])
df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,i seriously hate one subject to death but now ...,1
1,1,im so full of life i feel appalled,0
2,2,i sit here to write i start to dig out my feel...,1
3,3,ive been really angry with r and i feel like a...,2
4,4,i feel suspicious if there is no one outside l...,1


In [4]:
df["Sentiment"].value_counts()

0    2000
2    2000
1    1937
Name: Sentiment, dtype: int64

### Loading spacy english model

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(x):
    doc = nlp(x)
    non_stop_tokens = []
    for token in doc:
        if not token.is_stop and not token.is_punct:
            non_stop_tokens.append(token.lemma_)
    return " ".join(non_stop_tokens)

df["processed_text"] = df["Comment"].apply(preprocess)
df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment,processed_text
0,0,i seriously hate one subject to death but now ...,1,seriously hate subject death feel reluctant drop
1,1,im so full of life i feel appalled,0,m life feel appalled
2,2,i sit here to write i start to dig out my feel...,1,sit write start dig feeling think afraid accep...
3,3,ive been really angry with r and i feel like a...,2,ve angry r feel like idiot trust place
4,4,i feel suspicious if there is no one outside l...,1,feel suspicious outside like rapture happen


In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(df["Comment"],df["Sentiment"],test_size=0.2)


## Creating a pipeline of TFIDF vectorizer and Naive Bayes classifier

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

model1 = Pipeline([
    ("Vectorizer part" , TfidfVectorizer(ngram_range=(1,2))),
    ("Naive-Bayes Part" , MultinomialNB())
])


### Fitting unprocessed text

In [8]:
model1.fit(x_train,y_train)

In [9]:
model1.score(x_test,y_test)

0.8493265993265994

In [10]:
x_train_proc,x_test_proc,y_train,y_test = train_test_split(df["processed_text"],df["Sentiment"],test_size=0.2)
model1.fit(x_train_proc,y_train)

### Fitting Proccessed Text

In [11]:
model1.score(x_test_proc,y_test)

0.9250841750841751

## Creating Pipeline with Countvectorizer and Naive Bayes Classifier

### Fitting unproccessed text

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

model2 = Pipeline([
    ("Count Vectorizer part" , CountVectorizer(ngram_range=(1,2))),
    ("Naive-Bayes Part" , MultinomialNB())
])

model2.fit(x_train,y_train)
model2.score(x_test,y_test)

0.34175084175084175

### Fitting proccessed text

In [13]:
model2.fit(x_train_proc,y_train)
model2.score(x_test_proc,y_test)

0.9175084175084175

This suggests that processed text performs much better than unprocessed text

## Creating a pipeline of TFIDF vectorizer and Random Forest classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

model3 = Pipeline([
    ("Vectorizer part" , TfidfVectorizer(ngram_range=(1,2))),
    ("Random Forest Part" , RandomForestClassifier(n_estimators=10))
])


In [15]:
model3.fit(x_train_proc,y_train)

In [16]:
model3.score(x_test_proc,y_test)

0.8863636363636364

## Using Deep learning

Using word embedding because tfidf would give a vector about 300000 long and has crashed my computer

In [17]:
x_vectorized = [nlp(x).vector for x in list(df["processed_text"])]
x_vectorized

[array([-1.7931895e-01, -2.7498448e-01,  5.2552126e-02, -3.1157392e-01,
        -1.4767715e-01, -1.9336346e-01,  2.3145019e-01,  9.1706261e-02,
         2.3559973e-01, -5.9662435e-02, -4.2517170e-01, -5.9158307e-01,
        -2.7385059e-01, -3.4034851e-01,  2.9486971e-02,  5.6284845e-01,
         1.8835652e-01, -9.1856760e-01, -3.6040118e-01, -1.8199386e-01,
        -7.2205007e-02,  8.5671198e-01, -7.3948222e-01, -1.9110216e-01,
         3.1509873e-01, -2.3204722e-01, -2.2182362e-01,  9.2725092e-01,
         1.6476867e-01,  6.7865103e-01,  1.5175651e-01,  6.9145977e-01,
        -3.8501376e-01, -5.8223474e-01,  1.2673715e-01, -5.2417982e-01,
         2.0646934e-01, -4.0819311e-01,  2.2345614e-01, -3.2748702e-01,
        -4.0825590e-01, -1.2376205e-01,  7.6467521e-02,  4.7707516e-01,
         4.1962439e-01,  2.4416688e-01, -1.1206361e-01, -4.5334449e-01,
         2.0863709e-01, -3.4469895e-02, -3.2250160e-01,  3.1399539e-01,
        -3.7337095e-01, -3.2387835e-01, -5.4954257e-02,  2.13364

In [21]:
x_vectorized = np.array(x_vectorized)

In [22]:
x_vec_train,x_vec_test,y_vec_train,y_vec_test = train_test_split(x_vectorized,df["Sentiment"],test_size=0.2)
x_vec_train.shape

(4749, 96)

### A Simple Dense layer with layer inputs selected with trial and error

In [33]:
from keras import models, layers

model4 = models.Sequential([
    layers.Dense(96,activation='relu'),
    layers.Dense(150,activation='relu'),
    layers.Dense(150,activation='relu'),
    layers.Dense(10,activation='relu'),
    layers.Dense(3,activation='softmax'),
])

In [34]:
model4.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

### Fitting Word Embedded texts

In [35]:
model4.fit(x_vec_train,y_vec_train,epochs=25)

Epoch 1/25


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x2557fdbe800>

In [36]:
model4.evaluate(x_vec_test,y_vec_test)



[3.117119550704956, 0.4124579131603241]

### Thus, a pipeline of TFIDF vectorization with naive Bayes Classifier on processed text gives maximum accuracy (.92929) for this particular dataset

Note : Accuracy may change as I forgot to initialize random state