In [33]:
import re
import nltk
import pandas as pd 
import numpy as np 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as mlt

In [34]:
twitter_df = pd.read_csv('Twitter_Data.csv')

In [35]:
twitter_df

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [36]:
output = {
    '-1.0':'negative',
    '0.0':'neutral',
    '1.0':'positive'
}

In [37]:
twitter_df['category'].value_counts()

category
 1.0    72250
 0.0    55213
-1.0    35510
Name: count, dtype: int64

In [38]:
sampled_twitter_df = twitter_df.sample(frac= 0.1, random_state= 41)
sampled_twitter_df

Unnamed: 0,clean_text,category
1975,commentary rahul gandhi’ reluctant bid rival i...,0.0
69482,yes can talk\nvote for modi,0.0
49221,congrats congrats congrats what else need elec...,1.0
41741,udayanidhi stalin son dmk president stalin tue...,0.0
117056,sets tone the run 2019 election masterpiece w...,0.0
...,...,...
12796,arent you modi bhakt then why prefixed chowkid...,0.0
135350,must consider aspirations the young give them ...,1.0
75837,think india association with garudaprakashan a...,0.0
2641,actually modis main purpose that you all campa...,1.0


In [39]:
sampled_twitter_df = sampled_twitter_df.dropna()
sampled_twitter_df

Unnamed: 0,clean_text,category
1975,commentary rahul gandhi’ reluctant bid rival i...,0.0
69482,yes can talk\nvote for modi,0.0
49221,congrats congrats congrats what else need elec...,1.0
41741,udayanidhi stalin son dmk president stalin tue...,0.0
117056,sets tone the run 2019 election masterpiece w...,0.0
...,...,...
12796,arent you modi bhakt then why prefixed chowkid...,0.0
135350,must consider aspirations the young give them ...,1.0
75837,think india association with garudaprakashan a...,0.0
2641,actually modis main purpose that you all campa...,1.0


In [40]:
sampled_twitter_df.dtypes

clean_text     object
category      float64
dtype: object

In [41]:
sampled_twitter_df.isnull().any()

clean_text    False
category      False
dtype: bool

In [42]:
sampled_twitter_df['category'].value_counts()

category
 1.0    7229
 0.0    5546
-1.0    3522
Name: count, dtype: int64

In [43]:
lemmatizer= WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [44]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]','', text)
    tokens = word_tokenize(text)
    cleaned = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned)

In [45]:
sampled_twitter_df['cleaned_text'] = sampled_twitter_df['clean_text'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_twitter_df['cleaned_text'] = sampled_twitter_df['clean_text'].apply(preprocess)


In [46]:
sampled_twitter_df.drop(columns=['clean_text'], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_twitter_df.drop(columns=['clean_text'], inplace= True)


In [47]:
sampled_twitter_df

Unnamed: 0,category,cleaned_text
1975,0.0,commentary rahul gandhi reluctant bid rival in...
69482,0.0,yes talk vote modi
49221,1.0,congrats congrats congrats else need elect mod...
41741,0.0,udayanidhi stalin son dmk president stalin tue...
117056,0.0,set tone run election masterpiece question mod...
...,...,...
12796,0.0,arent modi bhakt prefixed chowkidar handle name
135350,1.0,must consider aspiration young give many oppor...
75837,0.0,think india association garudaprakashan organi...
2641,1.0,actually modis main purpose campaign


In [48]:
vecterizor = TfidfVectorizer()
X = vecterizor.fit_transform(sampled_twitter_df['cleaned_text'])
y = sampled_twitter_df['category']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 41, test_size= 0.2)

In [50]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [52]:
predict = model.predict(X_test)

In [53]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

        -1.0       0.89      0.06      0.11       704
         0.0       0.80      0.22      0.35      1122
         1.0       0.48      0.98      0.65      1434

    accuracy                           0.52      3260
   macro avg       0.73      0.42      0.37      3260
weighted avg       0.68      0.52      0.43      3260



In [56]:
sample_text = 'yes talk vote modi'
preprocess_text = preprocess(sample_text)
sample_vector = vecterizor.transform([preprocess_text])
predicted = model.predict(sample_vector)
index = f"{predicted[0]}"
print(f"The text sentiment is {output[index]}")


KeyError: '1.0'