## Task #2: Sentiment Analysis of Tweets

## Importing libraries and basic data wrangling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# The CSV is read with header=None, so the column names are supplied here.
columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=columns,
    header=None,
    encoding='ISO-8859-1',
)

In [4]:
# Sanity check on the load: (rows, columns) of the raw frame.
df.shape

(1600000, 6)

In [5]:
# Peek at the first five rows to confirm the supplied column names line up with the data.
df.head()

Unnamed: 0,polarity,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1600000 non-null  int64 
 1   id        1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   query     1600000 non-null  object
 4   user      1600000 non-null  object
 5   tweet     1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [9]:
# Class balance of the target: number of tweets per polarity label.
df['polarity'].value_counts()

0    800000
4    800000
Name: polarity, dtype: int64

In [10]:
# Missing values per column.
df.isnull().sum()

polarity    0
id          0
date        0
query       0
user        0
tweet       0
dtype: int64

In [11]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

0

## Text preprocessing

In [12]:
import re
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [13]:
# Filled on the first call to clean_tweet and reused afterwards. The stop-word
# list and the stemmer do not depend on the tweet, so rebuilding them per call
# (or, for the stop words, per token) is pure overhead across the whole corpus.
_CLEAN_TWEET_CACHE = {}


def _stopwords_and_stemmer():
    """Return the cached ``(stop_words, stemmer)`` pair, creating it on first use."""
    if not _CLEAN_TWEET_CACHE:
        # Both values are built before the dict is touched, so a failure
        # (e.g. missing NLTK corpus) never leaves a half-filled cache behind.
        _CLEAN_TWEET_CACHE.update(
            stop_words=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
        )
    return _CLEAN_TWEET_CACHE['stop_words'], _CLEAN_TWEET_CACHE['stemmer']


def clean_tweet(tweet):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: strip @mentions, strip http(s) URLs, replace every
    non-letter with a space, lowercase, tokenize, drop English stop words,
    and stem the remaining tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when no
        token survives.
    """
    stop_words, stemmer = _stopwords_and_stemmer()

    tweet = re.sub('@[A-Za-z0-9_]+', '', tweet)  # Remove @mentions
    # NOTE: the character class stops at '-', '_', '?', '=' and similar, so a
    # URL containing them is only removed up to the first such character.
    tweet = re.sub('https?://[A-Za-z0-9./]+', '', tweet)  # Remove URLs
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)  # Remove non-alphabetic characters
    tokens = word_tokenize(tweet.lower())  # Lowercase, then tokenize

    # Filter and stem in one pass; the stop-word lookup is a constant-time
    # membership test against the cached frozenset.
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

In [14]:
# Run the cleaning pipeline on every tweet and store the result alongside the raw text.
# clean_tweet is called once per row at Python level, so expect this cell to be slow on the full frame.
df['cleaned_tweet'] = df['tweet'].apply(clean_tweet)

In [15]:
from sklearn.model_selection import train_test_split

# Features are the cleaned tweet text; the target is the polarity label.
x, y = df['cleaned_tweet'], df['polarity']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=100,
    test_size=0.2,
)



## Feature engineering using TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 5000 terms; max_features is a tunable trade-off
# between feature-matrix size and vocabulary coverage.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features
# Fit the vectorizer on the training split only...
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
# ...and reuse the fitted vectorizer on the test split, so the test data never influences the features.
x_test_tfidf = tfidf_vectorizer.transform(x_test)

## Model Training (Naive Bayes)

In [17]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, fitted on the TF-IDF training matrix.
nb_classifier = MultinomialNB()
nb_classifier.fit(x_train_tfidf, y_train)

## Model Evaluation

In [18]:
from sklearn.metrics import classification_report, accuracy_score

# Predict labels for the held-out tweets.
y_pred = nb_classifier.predict(x_test_tfidf)

# Overall accuracy first, then the per-class precision / recall / F1 table.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.756015625
              precision    recall  f1-score   support

           0       0.76      0.75      0.75    160434
           4       0.75      0.77      0.76    159566

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000

