In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('fb_sentiment.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,FBPost,Label
0,0,Drug Runners and a U.S. Senator have somethin...,O
1,1,"Heres a single, to add, to Kindle. Just read t...",O
2,2,If you tire of Non-Fiction.. Check out http://...,O
3,3,Ghost of Round Island is supposedly nonfiction.,O
4,4,Why is Barnes and Nobles version of the Kindle...,N


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,FBPost,Label
995,995,I liked it. Its youth oriented and I think th...,P
996,996,"I think the point of the commercial is that, e...",P
997,997,Kindle 3 is such a great product. I could not ...,P
998,998,develop a way to share books! that is a big d...,N
999,999,I love my kindle! =),P


In [5]:
df = df.drop(columns=['Unnamed: 0'])

## Identify Missing Values

In [6]:
df.columns = ['text', 'label']

In [7]:
print(df.isnull().sum())

text     0
label    0
dtype: int64


In [8]:
df['label'] = df['label'].fillna('neutral')

In [9]:
print(df.isnull().sum())

text     0
label    0
dtype: int64


In [10]:
label_map = {'O':'natural', 'N':'negative', 'P':'positive'}

In [11]:
df['label'] = df['label'].map(label_map)

In [12]:
df.head()

Unnamed: 0,text,label
0,Drug Runners and a U.S. Senator have somethin...,natural
1,"Heres a single, to add, to Kindle. Just read t...",natural
2,If you tire of Non-Fiction.. Check out http://...,natural
3,Ghost of Round Island is supposedly nonfiction.,natural
4,Why is Barnes and Nobles version of the Kindle...,negative


In [13]:
df.tail()

Unnamed: 0,text,label
995,I liked it. Its youth oriented and I think th...,positive
996,"I think the point of the commercial is that, e...",positive
997,Kindle 3 is such a great product. I could not ...,positive
998,develop a way to share books! that is a big d...,negative
999,I love my kindle! =),positive


### Preprocess the Text

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [16]:
text = "Hello world! This is a test."
tokens = word_tokenize(text)
print(tokens)


['Hello', 'world', '!', 'This', 'is', 'a', 'test', '.']


In [17]:
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['cleaned_text'] = df['text'].apply(clean_text)
print(df[['text', 'cleaned_text']].head())


                                                text  \
0  Drug Runners and  a U.S. Senator have somethin...   
1  Heres a single, to add, to Kindle. Just read t...   
2  If you tire of Non-Fiction.. Check out http://...   
3    Ghost of Round Island is supposedly nonfiction.   
4  Why is Barnes and Nobles version of the Kindle...   

                                        cleaned_text  
0  drug runners us senator something murder state...  
1  heres single add kindle read 19th century stor...  
2                              tire nonfiction check  
3           ghost round island supposedly nonfiction  
4  barnes nobles version kindle much expensive ki...  


In [18]:
df['cleand_text'] = df['text'].apply(clean_text)

In [19]:
print(df[['text', 'cleaned_text']].head())

                                                text  \
0  Drug Runners and  a U.S. Senator have somethin...   
1  Heres a single, to add, to Kindle. Just read t...   
2  If you tire of Non-Fiction.. Check out http://...   
3    Ghost of Round Island is supposedly nonfiction.   
4  Why is Barnes and Nobles version of the Kindle...   

                                        cleaned_text  
0  drug runners us senator something murder state...  
1  heres single add kindle read 19th century stor...  
2                              tire nonfiction check  
3           ghost round island supposedly nonfiction  
4  barnes nobles version kindle much expensive ki...  


### Encode Sentiment Labels

In [20]:
label_map = {'natural':0, 'negative':1, 'positive':2}
df['label'] = df['label'].map(label_map)

In [21]:
df.head()

Unnamed: 0,text,label,cleaned_text,cleand_text
0,Drug Runners and a U.S. Senator have somethin...,0,drug runners us senator something murder state...,drug runners us senator something murder state...
1,"Heres a single, to add, to Kindle. Just read t...",0,heres single add kindle read 19th century stor...,heres single add kindle read 19th century stor...
2,If you tire of Non-Fiction.. Check out http://...,0,tire nonfiction check,tire nonfiction check
3,Ghost of Round Island is supposedly nonfiction.,0,ghost round island supposedly nonfiction,ghost round island supposedly nonfiction
4,Why is Barnes and Nobles version of the Kindle...,1,barnes nobles version kindle much expensive ki...,barnes nobles version kindle much expensive ki...


### Split the Dataset

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], test_size = 0.2, random_state = 42)

In [24]:
print("Trainning data", X_train.shape, y_train.shape)
print("Testing data", X_test.shape, y_test.shape)

Trainning data (800,) (800,)
Testing data (200,) (200,)


### Vectorize the Text

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
vectorizer  = TfidfVectorizer()

In [27]:
X_train_tfidf = vectorizer.fit_transform(X_train)

In [28]:
X_test_tfidf = vectorizer.transform(X_test)

In [29]:
print("Trainning the shape",  X_train_tfidf.shape)
print("Tetsting the shape", X_test_tfidf.shape)

Trainning the shape (800, 2480)
Tetsting the shape (200, 2480)


###  Train a Model

In [30]:
# # from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, accuracy_score

In [31]:
# model = LogisticRegression()
# model.fit(X_train_tfidf, y_train)


In [32]:
# y_pred = model.predict(X_test_tfidf)

In [33]:
# print("Accuracy Score::: ", accuracy_score(y_test, y_pred))
# print("Classification Report:::\n", classification_report(y_test, y_pred))

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
model = RandomForestClassifier(n_estimators = 1000, random_state = 42)

In [36]:
model.fit(X_train_tfidf, y_train)

In [37]:
y_pred = model.predict(X_test_tfidf)

In [38]:
print("Random Forest Accuracy Score:::", accuracy_score(y_test, y_pred))
print("Random Forest Classification Report:::\n", classification_report(y_test, y_pred))

Random Forest Accuracy Score::: 0.76
Random Forest Classification Report:::
               precision    recall  f1-score   support

           0       0.65      0.65      0.65        63
           1       1.00      0.08      0.15        12
           2       0.81      0.88      0.84       125

    accuracy                           0.76       200
   macro avg       0.82      0.54      0.55       200
weighted avg       0.77      0.76      0.74       200



In [39]:
import joblib

In [40]:
# Save the model
joblib.dump(model, 'sentiment_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']