# Sentiment analysis of posts from different social media sites

In [None]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("sent_analysis.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,i seriously hate one subject to death but now ...,neutral
1,1,im so full of life i feel appalled,negative
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,positive
4,4,i feel suspicious if there is no one outside l...,neutral


__*Though some values look incorrect on first glance, the full sentences are in accordance with the sentiment label*__

In [3]:
from sklearn.preprocessing import LabelEncoder

df["Sentiment"] = LabelEncoder().fit_transform(df["Sentiment"])
df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,i seriously hate one subject to death but now ...,1
1,1,im so full of life i feel appalled,0
2,2,i sit here to write i start to dig out my feel...,1
3,3,ive been really angry with r and i feel like a...,2
4,4,i feel suspicious if there is no one outside l...,1


In [4]:
df["Sentiment"].value_counts()

0    2000
2    2000
1    1937
Name: Sentiment, dtype: int64

### Loading spacy english model

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(x):
    doc = nlp(x)
    non_stop_tokens = []
    for token in doc:
        if not token.is_stop and not token.is_punct:
            non_stop_tokens.append(token.lemma_)
    return " ".join(non_stop_tokens)

df["processed_text"] = df["Comment"].apply(preprocess)
df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment,processed_text
0,0,i seriously hate one subject to death but now ...,1,seriously hate subject death feel reluctant drop
1,1,im so full of life i feel appalled,0,m life feel appalled
2,2,i sit here to write i start to dig out my feel...,1,sit write start dig feeling think afraid accep...
3,3,ive been really angry with r and i feel like a...,2,ve angry r feel like idiot trust place
4,4,i feel suspicious if there is no one outside l...,1,feel suspicious outside like rapture happen


In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(df["Comment"],df["Sentiment"],test_size=0.2)


## Creating a pipeline of TFIDF vectorizer and Naive Bayes classifier

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

model1 = Pipeline([
    ("Vectorizer part" , TfidfVectorizer(ngram_range=(1,2))),
    ("Naive-Bayes Part" , MultinomialNB())
])


### Fitting unprocessed text

In [14]:
model1.fit(x_train,y_train)

In [15]:
model1.score(x_test,y_test)

0.3367003367003367

In [16]:
x_train_proc,x_test_proc,y_train,y_test = train_test_split(df["processed_text"],df["Sentiment"],test_size=0.2)
model1.fit(x_train_proc,y_train)

### Fitting Proccessed Text

In [17]:
model1.score(x_test_proc,y_test)

0.9292929292929293

## Creating Pipeline with Countvectorizer and Naive Bayes Classifier

### Fitting unproccessed text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

model2 = Pipeline([
    ("Count Vectorizer part" , CountVectorizer(ngram_range=(1,2))),
    ("Naive-Bayes Part" , MultinomialNB())
])

model2.fit(x_train,y_train)
model2.score(x_test,y_test)

### Fitting proccessed text

In [19]:
model2.fit(x_train_proc,y_train)
model2.score(x_test_proc,y_test)

0.92003367003367

This suggests that processed text performs much better than unprocessed text

## Creating a pipeline of TFIDF vectorizer and Random Forest classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

model3 = Pipeline([
    ("Vectorizer part" , TfidfVectorizer(ngram_range=(1,2))),
    ("Random Forest Part" , RandomForestClassifier(n_estimators=10))
])


In [21]:
model3.fit(x_train_proc,y_train)

In [22]:
model3.score(x_test_proc,y_test)

0.9023569023569024

In [23]:
x_vectorized = TfidfVectorizer(ngram_range=(1,2)).fit_transform(df["processed_text"])
x_vectorized

<5937x38437 sparse matrix of type '<class 'numpy.float64'>'
	with 87851 stored elements in Compressed Sparse Row format>

In [24]:
x_vectorized.shape

(5937, 38437)

In [26]:
x_vec_train,x_vec_test,y_vec_train,y_vec_test = train_test_split(x_vectorized,df["Sentiment"],test_size=0.2)
x_vec_train.shape

(4749, 38437)

In [27]:
from keras import models, layers

model4 = models.Sequential([
    layers.Dense(38437,activation='relu'),
    layers.Dense(5000,activation='relu'),
    layers.Dense(200,activation='relu'),
    layers.Dense(3,activation='softmax'),
])

In [28]:
model4.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

### Aborted as it has too many neurons and crashed the computer

In [29]:
# model4.fit(x_vec_train,y_vec_train,epochs=10)

Epoch 1/10


### Thus, a pipeline of TFIDF vectorization with naive Bayes Classifier on processed text gives maximum accuracy (.92929) for this particular dataset

Note : Accuracy may change as I forgot to initialize random state