# Name - Ashish Dinesh Patil

In [1]:
import numpy as np   # Numpy Library for Statistics
import pandas as pd   # Pandas library for DataFrame
import re   # Regular Expression for matching patterns
from nltk.corpus import stopwords   # Stopwords - To remove words with less meaning
from nltk.tokenize import word_tokenize   # Word Tokenizer
from nltk.stem import PorterStemmer   # Stemming 
from sklearn.feature_extraction.text import CountVectorizer   # Bag of Words
from sklearn.feature_extraction.text import TfidfVectorizer   # Term Freq. and Inverse Document Freq.
from sklearn.model_selection import train_test_split   # For training and spliting (80%-20%)
from sklearn.naive_bayes import MultinomialNB    # Naive Byes ML Model
from sklearn.metrics import accuracy_score   # Analysis

In [2]:
twitter_data = pd.read_csv('Sentiment.csv')
twitter_data = pd.DataFrame(twitter_data)

print(twitter_data.shape)
twitter_data.head()

(13871, 3)


Unnamed: 0,id,sentiment,text
0,1,Neutral,RT @NancyLeeGrahn: How did everyone feel about...
1,2,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
2,3,Neutral,RT @TJMShow: No mention of Tamir Rice and the ...
3,4,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,5,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [3]:
twitter_data.info() # Information related to Data Frame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13871 entries, 0 to 13870
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         13871 non-null  int64 
 1   sentiment  13871 non-null  object
 2   text       13871 non-null  object
dtypes: int64(1), object(2)
memory usage: 325.2+ KB


In [4]:
twitter_data.describe() # Describes Data Frame

Unnamed: 0,id
count,13871.0
mean,6936.0
std,4004.357127
min,1.0
25%,3468.5
50%,6936.0
75%,10403.5
max,13871.0


In [5]:
twitter_data.sentiment.value_counts() # Sentiment Counts

Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64

In [6]:
# Replace sentiment 'Positive' with '1' and 'Negative' with '0' 
# twitter_data.sentiment.replace('Positive',1,inplace=True)
# twitter_data.sentiment.replace('Negative',0,inplace=True)
# twitter_data.head(10)

In [7]:
# Function to convert upper case to lower case
def toLower(text):
    return text.lower()

In [8]:
twitter_data.text = twitter_data.text.apply(toLower)
twitter_data.text[0]

'rt @nancyleegrahn: how did everyone feel about the climate change question last night? exactly. #gopdebate'

In [9]:
# Function to clean data using regular expressions
def clean(text):
    regex1 = re.compile(r'rt @.*?: ')
    cleaned1 = re.sub(regex1,'',text)
    regex2 = re.compile(r'@\w+')
    cleaned2 = re.sub(regex2,'',cleaned1)
    regex3 = re.compile(r'#\w+')
    cleaned3 = re.sub(regex3,'',cleaned2)
    regex4 = re.compile(r'http://t.co/\w+')
    return re.sub(regex4,'',cleaned3)

In [10]:
twitter_data.text = twitter_data.text.apply(clean)
twitter_data.text[0]

'how did everyone feel about the climate change question last night? exactly. '

In [11]:
# Function to remove special characters
def isSpecial(text):
    rem = ''
    for i in text:
        if i.isalnum():
            rem = rem + i
        else:
            rem = rem + ' '
    return rem

In [12]:
twitter_data.text = twitter_data.text.apply(isSpecial)
twitter_data.text[0]

'how did everyone feel about the climate change question last night  exactly  '

In [13]:
# Function to remove stopwords
def removeStopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

In [14]:
twitter_data.text = twitter_data.text.apply(removeStopwords)
twitter_data.text[0]

['everyone',
 'feel',
 'climate',
 'change',
 'question',
 'last',
 'night',
 'exactly']

In [15]:
twitter_data.text[13867]

['thought', 'huckabee', 'exceeded', 'expectations', 'http']

In [16]:
# Apply Stemming
def stemming(text):
    ps = PorterStemmer()
    return " ".join([ps.stem(w) for w in text])

In [17]:
twitter_data.text = twitter_data.text.apply(stemming)
twitter_data.text[0]

'everyon feel climat chang question last night exactli'

In [18]:
# Bag of Words Creation
reviews = np.array(twitter_data.iloc[:,0].values)
sentiments = np.array(twitter_data.sentiment.values)
cv = CountVectorizer(max_features = 1000) # Bag of words count vectorizer
cv_reviews = cv.fit_transform(twitter_data.text).toarray()
print("reviews.shape = ",cv_reviews.shape)
print("sentiments.shape = ",sentiments.shape)

reviews.shape =  (13871, 1000)
sentiments.shape =  (13871,)


In [19]:
# Bag of Words Array
cv_reviews

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# Term Frequency and Inverse Document Frequency Creation
tfidf = TfidfVectorizer()
tfidf_reviews = tfidf.fit_transform(twitter_data.text).toarray()

In [21]:
# Term Frequency and Inverse Document Frequency Creation Array 
tfidf_reviews

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
# Train Test And Split On Bag of Words
cv_reviews_train,cv_reviews_test,sentiments_train,sentiments_test = train_test_split(cv_reviews, sentiments, test_size=0.2,random_state=9)

In [23]:
cv_reviews_train.shape

(11096, 1000)

In [24]:
cv_reviews_test.shape

(2775, 1000)

In [25]:
sentiments_train.shape

(11096,)

In [26]:
sentiments_test.shape

(2775,)

In [27]:
# Defining the models and Training
mnb = MultinomialNB(alpha=1.0,fit_prior=True)
mnb.fit(cv_reviews_train,sentiments_train)

MultinomialNB()

In [28]:
# Prediction and accuracy metrics of Model
sentiments_predicted=mnb.predict(cv_reviews_test)

In [29]:
print("Accuracy of Model Using Count Vectorizer And Stemming = ",accuracy_score(sentiments_test,sentiments_predicted))

Accuracy of Model Using Count Vectorizer And Stemming =  0.6299099099099099


In [30]:
# Train Test And Split On TF and IDF
tfidf_reviews_train,tfidf_reviews_test,sentiments_train,sentiments_test = train_test_split(tfidf_reviews, sentiments, test_size=0.2,random_state=9)

In [31]:
tfidf_reviews_train.shape

(11096, 8066)

In [32]:
tfidf_reviews_test.shape

(2775, 8066)

In [33]:
sentiments_train.shape

(11096,)

In [34]:
sentiments_test.shape

(2775,)

In [35]:
# Defining the models and Training
mnb = MultinomialNB(alpha=1.0,fit_prior=True)
mnb.fit(tfidf_reviews_train,sentiments_train)

MultinomialNB()

In [36]:
# Prediction and accuracy metrics of Model
sentiments_predicted=mnb.predict(tfidf_reviews_test)

In [37]:
print("Accuracy of Model Using TF, IDF and Stemming = ",accuracy_score(sentiments_test,sentiments_predicted))

Accuracy of Model Using TF, IDF and Stemming =  0.6594594594594595
