In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pranjalthapliyal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the data from the CSV file
df = pd.read_csv('sentiment_data_100k.csv', encoding='ISO-8859-1')


In [3]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,4,1881672289,Fri May 22 05:16:44 PDT 2009,NO_QUERY,viry_trivium,"Happy birthday, sister!"
1,4,2009051656,Tue Jun 02 15:04:22 PDT 2009,NO_QUERY,Earlthedog,Just finished eating supper and now I am attac...
2,0,2211886069,Wed Jun 17 13:24:27 PDT 2009,NO_QUERY,StefyyMarie,i hate love right now.
3,4,1558734942,Sun Apr 19 09:15:07 PDT 2009,NO_QUERY,tezzer57,"Photo fest in LDN, Tudor feast last night, don..."
4,4,1834470136,Mon May 18 03:03:30 PDT 2009,NO_QUERY,dave_sherratt,"@piercedbrat happy bday for tomoz, all the bes..."


In [4]:
# Add column headers as follows : the first column is the 'target', the second column is the 'ids' of the tweet, the third column is the 'date', fourth column 'flag', fifth column 'user', and the last sixth column is the 'text'.

df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

In [5]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,4,1881672289,Fri May 22 05:16:44 PDT 2009,NO_QUERY,viry_trivium,"Happy birthday, sister!"
1,4,2009051656,Tue Jun 02 15:04:22 PDT 2009,NO_QUERY,Earlthedog,Just finished eating supper and now I am attac...
2,0,2211886069,Wed Jun 17 13:24:27 PDT 2009,NO_QUERY,StefyyMarie,i hate love right now.
3,4,1558734942,Sun Apr 19 09:15:07 PDT 2009,NO_QUERY,tezzer57,"Photo fest in LDN, Tudor feast last night, don..."
4,4,1834470136,Mon May 18 03:03:30 PDT 2009,NO_QUERY,dave_sherratt,"@piercedbrat happy bday for tomoz, all the bes..."


In [6]:
# Keep only the 'text' and 'target' columns
data = df[['text', 'target']]

# Convert the target values to numerical values
data['target'] = df['target'].replace([0, 2, 4], ['negative', 'neutral', 'positive'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target'] = df['target'].replace([0, 2, 4], ['negative', 'neutral', 'positive'])


In [7]:
data.head()

Unnamed: 0,text,target
0,"Happy birthday, sister!",positive
1,Just finished eating supper and now I am attac...,positive
2,i hate love right now.,negative
3,"Photo fest in LDN, Tudor feast last night, don...",positive
4,"@piercedbrat happy bday for tomoz, all the bes...",positive


In [8]:
# # Count the number of positive and negative and neutral tweets

# df['target'].value_counts()

In [9]:
# Shuffle the DataFrame and select the first 10000 rows
data = data.sample(frac=1).reset_index(drop=True)[:10000]

In [10]:
data.head()

Unnamed: 0,text,target
0,... 31 days until I leave for Ontario!!,positive
1,Tried to have my picture taken with @jimmycarr...,negative
2,@LeslieSanchez http://twitpic.com/69esj - Sean...,positive
3,"Praying for sunshine - don't fancy play doh, p...",positive
4,tonight I'm gonna start again. I don't love he...,negative


In [11]:
# Count the number of positive and negative and neutral tweets

data['target'].value_counts()

positive    5007
negative    4993
Name: target, dtype: int64

In [12]:
# preprocess the text data
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'https?://\S+', '', text)  # remove urls
    text = re.sub(r'<.*?>', '', text)  # remove html tags
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'#\w+', '', text)  # remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()  # convert to lowercase
    tokens = nltk.word_tokenize(text)  # tokenize
    filtered_tokens = [word for word in tokens if word not in stop_words]  # remove stop words
    text = ' '.join(filtered_tokens)  # join tokens
    return text

In [13]:
data['text'] = data['text'].apply(preprocess_text)

In [14]:
# split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data['text'], data['target'], test_size=0.2, random_state=42)

In [15]:
# vectorize the training data
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(train_data)


In [16]:
# train a multinomial naive bayes classifier
clf = MultinomialNB()
clf.fit(train_vectors, train_labels)

In [17]:
# predict the test data and save to CSV file
test_vectors = vectorizer.transform(test_data)
test_predictions = clf.predict(test_vectors)
test_predictions_df = pd.DataFrame({'text': test_data, 'predicted_emotion': test_predictions})
test_predictions_df.to_csv('predicted_outputs.csv', index=False)

print("Model training and predictions completed successfully!")

Model training and predictions completed successfully!


In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# print the accuracy score
print("Accuracy score: ", accuracy_score(test_labels, test_predictions))

# print the classification report
print("Classification report: \n", classification_report(test_labels, test_predictions))

# print the confusion matrix
print("Confusion matrix: \n", confusion_matrix(test_labels, test_predictions))

Accuracy score:  0.723
Classification report: 
               precision    recall  f1-score   support

    negative       0.70      0.77      0.73       985
    positive       0.75      0.68      0.71      1015

    accuracy                           0.72      2000
   macro avg       0.73      0.72      0.72      2000
weighted avg       0.73      0.72      0.72      2000

Confusion matrix: 
 [[759 226]
 [328 687]]
