In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer #ml library, convert text data to numeric data
from sklearn.model_selection import train_test_split #split data into training and test data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score # accuracy of ml model


In [2]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\archa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#print stopwords in english
print(stopwords.words('english'))#not required for processing

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# Data processing: load the tweets CSV into a pandas DataFrame.
# No column names are passed here, so pandas uses the first line of the file as
# the header row (the frame is re-read with explicit names further down).
twitter_data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding='ISO-8859-1',  # decode the file as ISO-8859-1 (Latin-1)
)

In [6]:
#check no. of rows,columns
twitter_data.shape

(9999, 6)

In [7]:
#print first 5 rows of data
twitter_data.head()#first column is read as 0th column name

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [8]:
# Read the dataset again, this time supplying the column headings explicitly so
# the first line of the file is kept as a data row instead of becoming the header.
column_names = ['Target', 'ID', 'Date', 'Flag', 'User', 'Text']
twitter_data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding='ISO-8859-1',
)

In [9]:
twitter_data.shape

(10000, 6)

In [10]:
twitter_data.head()

Unnamed: 0,Target,ID,Date,Flag,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [11]:
#check for missing values
twitter_data.isnull().sum()

Target    0
ID        0
Date      0
Flag      0
User      0
Text      0
dtype: int64

In [12]:
#checking distribution of target column
twitter_data['Target'].value_counts()

Target
0    5000
4    5000
Name: count, dtype: int64

In [13]:
# Positive tweets are labelled 4 in the raw data; relabel them as 1 so the
# target is 0 (negative) / 1 (positive).
twitter_data['Target'] = twitter_data['Target'].replace({4: 1})

In [14]:
twitter_data['Target'].value_counts()

Target
0    5000
1    5000
Name: count, dtype: int64

In [15]:
# Label convention: 0 => negative tweet, 1 => positive tweet
########### Stemming ############
# Stemming reduces a word to its root form (e.g. "acting" -> "act"),
# which shortens words and merges inflected variants into a single token.
# NOTE(review): port_stem is only referenced by the commented-out stemming()
# draft in the next cell; the active pipeline creates its own `stemmer`.
port_stem=PorterStemmer()



In [16]:
#def stemming(content):#tweet will be passed
#    stemmed_content=re.sub('[^a-zA-Z]',' ',content)#remove everything that's not alphabet 
 #   stemmed_content=stemmed_content.lower() #lower case convertion
  #  stemmed_content=stemmed_content.split() #split the words n put them in list
   # stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.word('english')]
    #if word dosent belong to stopword then process in stemming_content
    #stemmed_content=' '.join(stemmed_content) #join words back from tweet

    #return stemmed_content

In [17]:

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Text cleaner: keeps ASCII letters and whitespace, deletes everything else
def remove_special_characters(text):
    """Return ``text`` with every non-letter, non-whitespace character deleted.

    Digits, punctuation and symbols are removed outright (not replaced by a
    space), so tokens joined only by punctuation are merged, e.g.
    ``"that's"`` becomes ``"thats"``. Letter case and whitespace are left
    untouched.
    """
    disallowed = re.compile(r'[^a-zA-Z\s]')
    return disallowed.sub('', text)

# Full preprocessing pipeline for a single tweet
def process_text(text):
    """Clean, tokenize, filter and stem one piece of text.

    Steps, in order:
      1. strip special characters and punctuation via ``remove_special_characters``;
      2. split the result on whitespace;
      3. drop every token whose lower-cased form is in ``stop_words``;
      4. stem the remaining tokens with ``stemmer``.

    Returns the surviving stems joined by single spaces.
    """
    cleaned = remove_special_characters(text)
    kept_stems = []
    for token in cleaned.split():
        # The stop-word comparison is case-insensitive; the token itself is
        # handed to the stemmer unchanged.
        if token.lower() in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

# Apply the processing function to the 'Text' column
twitter_data['stemmed_content'] = twitter_data['Text'].apply(process_text)

In [18]:
twitter_data.head()

Unnamed: 0,Target,ID,Date,Flag,User,Text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot httptwitpiccomyzl awww that bummer ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset cant updat facebook text might cri resul...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav im mad cant see


In [19]:
print(twitter_data['stemmed_content'])

0       switchfoot httptwitpiccomyzl awww that bummer ...
1       upset cant updat facebook text might cri resul...
2       kenichan dive mani time ball manag save rest g...
3                         whole bodi feel itchi like fire
4                   nationwideclass behav im mad cant see
                              ...                        
9995                           unolk case stop watch play
9996    mitchelmusso amazingli nice cant wait see actu...
9997          get readi go home soon go see ladi tomorrow
9998    luva mah cuzzo make wors day better way fam us...
9999    miafreedman friend bag read recent love tri se...
Name: stemmed_content, Length: 10000, dtype: object


In [20]:
print(twitter_data['stemmed_content'])

0       switchfoot httptwitpiccomyzl awww that bummer ...
1       upset cant updat facebook text might cri resul...
2       kenichan dive mani time ball manag save rest g...
3                         whole bodi feel itchi like fire
4                   nationwideclass behav im mad cant see
                              ...                        
9995                           unolk case stop watch play
9996    mitchelmusso amazingli nice cant wait see actu...
9997          get readi go home soon go see ladi tomorrow
9998    luva mah cuzzo make wors day better way fam us...
9999    miafreedman friend bag read recent love tri se...
Name: stemmed_content, Length: 10000, dtype: object


In [21]:
print(twitter_data['Target'])


0       0
1       0
2       0
3       0
4       0
       ..
9995    1
9996    1
9997    1
9998    1
9999    1
Name: Target, Length: 10000, dtype: int64


In [22]:
#separating the data and the labels
x=twitter_data['stemmed_content'].values #save tweet
y=twitter_data['Target'].values 

In [23]:
print(x)

['switchfoot httptwitpiccomyzl awww that bummer shoulda got david carr third day'
 'upset cant updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'get readi go home soon go see ladi tomorrow'
 'luva mah cuzzo make wors day better way fam us r attach hip'
 'miafreedman friend bag read recent love tri see think']


In [24]:
print(y)

[0 0 0 ... 1 1 1]


In [25]:
# Split the data: 20% is held out for testing, the remaining 80% is for training.
# stratify=y keeps the class proportions of y the same in both splits, and
# random_state=2 makes the shuffle reproducible.
# x_train / x_test hold the tweet texts, y_train / y_test the matching labels.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)



In [26]:
print(x.shape,x_train.shape,x_test.shape)

(10000,) (8000,) (2000,)


In [27]:
print(x_test)

['happyahma welcom back sorri hear ant'
 'jemcam thursday noth russbak work though still hang though easter plan'
 'whoa wow follow feel sooo love' ...
 'bayachaya hehe thx could stare u foreva lol'
 'sociallycub busi hard catch uptoo cold beer mayb bourbon'
 'wind day wrestl min ipa soul search fun fun fun']


In [28]:
print(x_train)

['cleggett need quit watch tina watch greek instead'
 'nicolawil chat line yet freeview good'
 'samzon least adult make choic hope fulli amp accur inform diabet man circ'
 ... 'receiv cutest facebook messag' 'go swim'
 'clutch surpris parti fun kell live band pretti amaz im sorri miss need get togeth']


In [29]:
#converting textual data to numerical data

vectorizer=TfidfVectorizer() #converts text=>numeric

x_train=vectorizer.fit_transform(x_train)
x_test=vectorizer.transform(x_test)

In [30]:
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 59492 stored elements and shape (8000, 13725)>
  Coords	Values
  (0, 2194)	0.4563256680863695
  (0, 8361)	0.2192649737187022
  (0, 9660)	0.3267531512465209
  (0, 13030)	0.44253850648843734
  (0, 12140)	0.4364181299064485
  (0, 4728)	0.37262589453068623
  (0, 5915)	0.33432164600169295
  (1, 8457)	0.5330490258838495
  (1, 1970)	0.4137867640877512
  (1, 7073)	0.3816911060621504
  (1, 13579)	0.32076816288520876
  (1, 4265)	0.4932949386184833
  (1, 4625)	0.2296062687205081
  (2, 10411)	0.3402414426526845
  (2, 6941)	0.2344306207543616
  (2, 136)	0.3148666884636464
  (2, 7413)	0.1681661068545506
  (2, 2075)	0.25594833381125814
  (2, 5253)	0.16900770787412758
  (2, 4345)	0.3253981629870802
  (2, 405)	0.17372615420919751
  (2, 69)	0.3402414426526845
  (2, 5875)	0.2813230880002962
  (2, 3014)	0.3402414426526845
  (2, 7446)	0.21280009395015076
  :	:
  (7995, 4462)	0.5835628945232388
  (7996, 2797)	0.5324261494245249
  (7996, 11613)	0.

In [31]:
print(x_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12326 stored elements and shape (2000, 13725)>
  Coords	Values
  (0, 518)	0.633908573548191
  (0, 860)	0.31015160871826525
  (0, 5011)	0.41029718649803737
  (0, 11182)	0.37511826054281616
  (0, 13103)	0.4392133984189193
  (1, 3443)	0.323012783017568
  (1, 4912)	0.3124274719815438
  (1, 6182)	0.44088986092806814
  (1, 8576)	0.2828193230697734
  (1, 9280)	0.30377864311837577
  (1, 11425)	0.21705736439556814
  (1, 12045)	0.494925592748902
  (1, 12082)	0.32503169305795654
  (1, 13332)	0.18354528183662366
  (2, 3960)	0.3047554210686527
  (2, 4154)	0.3228134020303214
  (2, 7254)	0.27069943217341536
  (2, 11158)	0.4447875131733955
  (2, 13189)	0.6032939321509776
  (2, 13379)	0.4096803116757101
  (3, 5075)	0.5424506510829136
  (3, 5118)	0.6119798709415727
  (3, 7254)	0.38597470915456505
  (3, 11907)	0.4269091854189927
  (4, 784)	0.3757077603852905
  :	:
  (1994, 11349)	0.5670366722153692
  (1994, 11916)	0.3200814834188279
  (1995, 1

In [32]:
###### LOGISTIC REGRESSION ######
model=LogisticRegression(max_iter=1000) # max_iter: upper bound on the solver's iterations while it tries to converge

In [33]:
model.fit(x_train,y_train)

In [34]:
#######Model Evaluation#######
##accuracy score on training data
x_train_prediction=model.predict(x_train)  #model has to predict the target
training_data_accuracy=accuracy_score(y_train,x_train_prediction)


In [35]:
print("Accuracy score on training data =",training_data_accuracy)

Accuracy score on training data = 0.89075


In [36]:
#accuracy score on test data
x_test_prediction=model.predict(x_test)
test_data_accuracy=accuracy_score(y_test,x_test_prediction)

In [37]:
print("Accuracy score on test data =",test_data_accuracy)

Accuracy score on test data = 0.7755


In [38]:
# Save the trained model to disk so it can be reloaded without retraining.
import pickle

filename = 'trained_model.sav'
# Open the file in a context manager so the handle is flushed and closed as
# soon as the dump finishes; a bare open() handed to pickle.dump is never
# closed explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [39]:
# New predictions using the saved model: load the pickled classifier back
# from 'trained_model.sav'.
import pickle

# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# load a file that this notebook (or another trusted source) produced.
# The context manager closes the file handle once loading is done.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)




In [40]:
# Spot-check the reloaded model on the test sample at index 200
sample_index = 200
x_new = x_test[sample_index]
print(y_test[sample_index])  # true value

prediction = loaded_model.predict(x_new)
print(prediction)  # predicted value

# Label 0 is reported as negative; any other label as positive
print("Negative tweet" if prediction[0] == 0 else "Positive tweet")

1
[1]
Positive tweet


In [41]:
# Spot-check the reloaded model on the test sample at index 25
sample_index = 25
x_new = x_test[sample_index]
print(y_test[sample_index])  # true value

prediction = loaded_model.predict(x_new)
print(prediction)  # predicted value

# Label 0 is reported as negative; any other label as positive
print("Negative tweet" if prediction[0] == 0 else "Positive tweet")

0
[0]
Negative tweet


In [42]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer  # transforms text data into numerical features
# Persist `vectorizer`, the instance that was fitted on the training tweets, so
# new text can later be transformed with the same fitted vocabulary.
# Pickle serializes the Python object into a byte stream that can be stored on disk.
with open('vectorizer.sav', 'wb') as f:  # 'wb' = open in binary write mode
    pickle.dump(vectorizer, f)  # write the serialized vectorizer into the open file


In [43]:
!pip install flask_cors 
#required for middle.py

