In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [2]:
from nltk.corpus import stopwords  # Import specific resource

# Download resources when first used (if not already downloaded)
stopwords.words('english')


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [3]:
train_data=pd.read_csv('twitter_training.csv',encoding='ISO-8859-1')
test_data=pd.read_csv('twitter_validation.csv',encoding='ISO-8859-1')

In [4]:
train_data.shape


(74681, 4)

In [5]:
test_data.shape

(999, 4)

In [6]:
train_data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [7]:
column_names=['flag','bord','target','text']
train_data=pd.read_csv('twitter_training.csv',names=column_names,encoding='ISO-8859-1')
test_data=pd.read_csv('twitter_validation.csv',names=column_names,encoding='ISO-8859-1')

In [8]:
train_data.shape

(74682, 4)

In [9]:
train_data.isnull().sum()

flag        0
bord        0
target      0
text      686
dtype: int64

In [10]:
import pandas as pd

# Assuming you have already imported pandas

# Drop rows with missing values (NaN or empty strings) in the 'text' column
train_data_cleaned = train_data.dropna(subset=['text'],inplace=True)

# Alternative (if you only want to remove rows with empty strings):
# train_data_cleaned = train_data[train_data['text'].str.strip() != '']
train_data.isnull().sum()

flag      0
bord      0
target    0
text      0
dtype: int64

In [11]:
test_data.isnull().sum()

flag      0
bord      0
target    0
text      0
dtype: int64

In [12]:
train_data.shape

(73996, 4)

In [13]:
train_data['target'].value_counts(),test_data['target'].value_counts()

(Negative      22358
 Positive      20655
 Neutral       18108
 Irrelevant    12875
 Name: target, dtype: int64,
 Neutral       285
 Positive      277
 Negative      266
 Irrelevant    172
 Name: target, dtype: int64)

In [14]:
train_data.head()

Unnamed: 0,flag,bord,target,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [15]:
mapping = {'Negative': 0, 'Positive': 1, 'Neutral': 2, 'Irrelevant': 3}

# Replace the values using the mapping dictionary
train_data['target'] = train_data['target'].replace(mapping)
test_data['target'] = test_data['target'].replace(mapping)

print(train_data)
print(train_data['target'].value_counts())

       flag         bord  target  \
0      2401  Borderlands       1   
1      2401  Borderlands       1   
2      2401  Borderlands       1   
3      2401  Borderlands       1   
4      2401  Borderlands       1   
...     ...          ...     ...   
74677  9200       Nvidia       1   
74678  9200       Nvidia       1   
74679  9200       Nvidia       1   
74680  9200       Nvidia       1   
74681  9200       Nvidia       1   

                                                    text  
0      im getting on borderlands and i will murder yo...  
1      I am coming to the borders and I will kill you...  
2      im getting on borderlands and i will kill you ...  
3      im coming on borderlands and i will murder you...  
4      im getting on borderlands 2 and i will murder ...  
...                                                  ...  
74677  Just realized that the Windows partition of my...  
74678  Just realized that my Mac window partition is ...  
74679  Just realized the windows par

In [16]:
train_data['target'].value_counts()
# train_data.head()

0    22358
1    20655
2    18108
3    12875
Name: target, dtype: int64

In [17]:

port_stem=PorterStemmer()

In [18]:
def stemming(content):
    stemmed_centent = re.sub('[^a-zA-Z]',' ',content)
    stemmed_centent=stemmed_centent.lower()
    stemmed_centent=stemmed_centent.split() 
    stemmed_centent=[port_stem.stem(word) for word in stemmed_centent if not word in stopwords.words('english')]
    stemmed_centent= ' '.join(stemmed_centent)
    
    return stemmed_centent
    

In [19]:
train_data['stemmed_content'] = train_data['text'].apply(stemming)
test_data['stemmed_content'] = test_data['text'].apply(stemming)


In [21]:
train_data.head()

Unnamed: 0,flag,bord,target,text,stemmed_content
0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,im get borderland murder
1,2401,Borderlands,1,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,im get borderland kill
3,2401,Borderlands,1,im coming on borderlands and i will murder you...,im come borderland murder
4,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...,im get borderland murder


In [22]:
test_data.head()

Unnamed: 0,flag,bord,target,text,stemmed_content
0,3364,Facebook,3,I mentioned on Facebook that I was struggling ...,mention facebook struggl motiv go run day tran...
1,352,Amazon,2,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezo reject claim co...
2,8312,Microsoft,0,@Microsoft Why do I pay for WORD when it funct...,microsoft pay word function poorli samsungu ch...
3,4371,CS-GO,0,"CSGO matchmaking is so full of closet hacking,...",csgo matchmak full closet hack truli aw game
4,4433,Google,2,Now the President is slapping Americans in the...,presid slap american face realli commit unlaw ...


In [23]:
X_train=train_data['stemmed_content']
y_train=train_data['target'].values
X_test=test_data['stemmed_content']
y_test=test_data['target'].values

In [24]:
vectorizor=TfidfVectorizer()

X_train=vectorizor.fit_transform(X_train)
X_test=vectorizor.transform(X_test)

In [25]:
print(X_train)

  (0, 12545)	0.672460959876397
  (0, 2122)	0.4325675736546788
  (0, 7341)	0.32489583024771473
  (0, 9088)	0.5050982589083702
  (1, 10336)	0.4966318265888706
  (1, 2120)	0.7452188611559063
  (1, 3458)	0.4449782891293641
  (2, 10336)	0.5345301888165152
  (2, 2122)	0.49394422039680846
  (2, 7341)	0.3709950244906562
  (2, 9088)	0.5767662231645913
  (3, 3458)	0.4054038888193035
  (3, 12545)	0.6499836346024973
  (3, 2122)	0.41810879814782276
  (3, 9088)	0.48821511098131337
  (4, 12545)	0.672460959876397
  (4, 2122)	0.4325675736546788
  (4, 7341)	0.32489583024771473
  (4, 9088)	0.5050982589083702
  (5, 12545)	0.672460959876397
  (5, 2122)	0.4325675736546788
  (5, 7341)	0.32489583024771473
  (5, 9088)	0.5050982589083702
  (6, 9798)	0.25052583390447347
  (6, 21164)	0.28974429718140715
  :	:
  (73993, 15566)	0.3088104796360347
  (73993, 21910)	0.20383219841601852
  (73994, 13997)	0.42400408152426794
  (73994, 13240)	0.2020200182081118
  (73994, 5128)	0.2876604671630894
  (73994, 21313)	0.2717926

In [26]:
print(X_test)


  (0, 19553)	0.26778229715583673
  (0, 19420)	0.24918602152655164
  (0, 19415)	0.2119521870969451
  (0, 19159)	0.15520742206881216
  (0, 18937)	0.20046984994510514
  (0, 18312)	0.23563920410016562
  (0, 16409)	0.17767357096037106
  (0, 14227)	0.18787711783252026
  (0, 12403)	0.2821438894839283
  (0, 11902)	0.22355348867579108
  (0, 10735)	0.25000079642825723
  (0, 8245)	0.3310907298106685
  (0, 7805)	0.1439014304417503
  (0, 7773)	0.29128483777540726
  (0, 7585)	0.1258886900577541
  (0, 7341)	0.11662137700819558
  (0, 6081)	0.14340766715848072
  (0, 4290)	0.14303519378457
  (0, 1543)	0.2508370152181519
  (0, 1110)	0.3065230230794079
  (1, 20008)	0.1710007926454787
  (1, 15761)	0.246530134404228
  (1, 12869)	0.30147231806974883
  (1, 10952)	0.10853874001065932
  (1, 9738)	0.22979482668931536
  :	:
  (997, 14455)	0.1666132118858349
  (997, 8211)	0.25745644647412097
  (997, 5121)	0.40739212218503146
  (997, 4290)	0.2164128148321547
  (997, 3458)	0.22778580528821776
  (997, 2122)	0.2349243

In [27]:
model=LogisticRegression(max_iter=1000)

In [28]:
model.fit(X_train,y_train)

In [29]:
X_train_pred=model.predict(X_train)
training_acc=accuracy_score(y_train,X_train_pred)

In [30]:
print('accuracy=',training_acc)

accuracy= 0.8426671711984431


In [31]:
X_test_pred=model.predict(X_test)
testing_acc=accuracy_score(y_test,X_test_pred)

In [32]:
print('final accuracy=',testing_acc)

final accuracy= 0.89


In [33]:
import pickle


saving the training model

In [34]:
filename='trained_model.sav'
pickle.dump(model,open(filename,'wb'))

using the saved model 

In [35]:
loaded_model=pickle.load(open('trained_model.sav','rb'))

In [36]:

X_new=X_test[200]
print(y_test[200])

print('sentence',X_test[0])


prediction=loaded_model.predict(X_new)
print(prediction)

if (prediction [0] == 0):
    print ('Negative tweet')
else:
    print('Positive tweet')

2
sentence   (0, 19553)	0.26778229715583673
  (0, 19420)	0.24918602152655164
  (0, 19415)	0.2119521870969451
  (0, 19159)	0.15520742206881216
  (0, 18937)	0.20046984994510514
  (0, 18312)	0.23563920410016562
  (0, 16409)	0.17767357096037106
  (0, 14227)	0.18787711783252026
  (0, 12403)	0.2821438894839283
  (0, 11902)	0.22355348867579108
  (0, 10735)	0.25000079642825723
  (0, 8245)	0.3310907298106685
  (0, 7805)	0.1439014304417503
  (0, 7773)	0.29128483777540726
  (0, 7585)	0.1258886900577541
  (0, 7341)	0.11662137700819558
  (0, 6081)	0.14340766715848072
  (0, 4290)	0.14303519378457
  (0, 1543)	0.2508370152181519
  (0, 1110)	0.3065230230794079
[2]
Positive tweet
