# Importing Libraries

In [1]:
import numpy as numpy
import pandas as pd

Libraries used to visualize the data in this project

In [2]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
import sklearn
from sklearn.utils import shuffle 
from sklearn.feature_extraction.text import TfidfVectorizer

NLP Preprocessing libraries

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
import re
import random
import gensim

In [6]:
from collections import Counter
import unicodedata as udata
import string

Checking the library versions

In [7]:
# Report the version of every core library used by this notebook.
for module in (sklearn, matplotlib, numpy, pd, nltk):
    print(module.__version__)

# Reading csv file

In [8]:
# Load the raw tweets. The file is decoded as latin-1 and, because of
# header=None, no row is used as a header; column names are assigned later.
df = pd.read_csv("../input/dataset-demo/TwitterSentimentAnalysis.csv", encoding='latin-1', header=None)

Shuffling the data in the data frame

In [9]:
# Shuffle the rows and rebuild a clean 0..n-1 index. The seed is fixed so that a
# Restart & Run All reproduces the same row order; without it every run feeds a
# different permutation into the seeded train/test split further down.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

# Give column names

Assigning the Columns name to the data

In [10]:
# Label the six positional columns of the headerless CSV.
column_names = ["sentiment", "id", "date", "query", "user", "text"]
df.columns = column_names

In [11]:
# Display the column labels to confirm the assignment made in the previous cell.
df.columns

Checking for null values in the dataset. Here we count the null values in each column of the data set.

In [12]:
# Per-column count of missing values.
df.isna().sum()

Checking for duplicate rows and counting them in the data set

In [13]:
# Total number of rows flagged by DataFrame.duplicated() with its default settings.
df.duplicated().sum()

get the first 5 rows from the dataframe

In [14]:
# Preview the first five rows with all six columns still present.
df.head(5)

Drop the columns that are not used for sentiment classification (id, date, query, user) from the dataframe

In [15]:
# Keep only the label and the tweet text; the metadata columns are not used below.
df = df.drop(columns=["id", "date", "query", "user"])

In [16]:
# Preview the first five rows after the metadata columns were dropped.
df.head(5)

Count the number of tweets per sentiment label (4 stands for a positive tweet and 0 stands for a negative tweet)

In [17]:
# Number of tweets per sentiment label.
df["sentiment"].value_counts()












# Cleaning data

Add a new column, pre_clean_len, to the dataframe; it holds the character length of each raw tweet

In [18]:
# Character length of every raw tweet, stored next to the text.
df['pre_clean_len'] = df['text'].map(len)

Finding outliers with a box plot of the pre_clean_len column

In [19]:
# Box plot of raw tweet lengths, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.boxplot(df["pre_clean_len"])
plt.show()

Check for any tweets longer than 140 characters

In [20]:
# First ten raw tweets whose length exceeds 140 characters.
df.loc[df["pre_clean_len"] > 140].head(10)

At this stage, if you want you could remove these outlier tweets

# Cleaning operations

The cleaning cell below performs the following operations:

- import Beautiful Soup
- remove @mentions from tweets
- remove URLs from tweets
- convert contractions such as "isn't" to "is not"
- keep only the text of each tweet (strip HTML markup)
- remove the utf-8-sig byte-order mark
- convert everything to lower case
- replace non-alphabetic characters with a space
- tokenize with WordPunctTokenizer and keep only words longer than one character
- join the words back into a single string

In [21]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

# Regular expressions and lookup tables used by tweet_cleaner.
pat1 = r'@[A-Za-z0-9_]+'        # @mentions
pat2 = r'https?://[^ ]+'        # http:// and https:// URLs
combined_pat = r'|'.join((pat1, pat2))  # one alternation matching either of the above
# Bare "www." URLs. The dot is escaped and the match is anchored at a word
# boundary: the previous r'www.[^ ]+' let "." match any character, so a tweet
# such as "awww so cute" lost "www so" (the "www", the space and the next word).
www_pat = r'\bwww\.[^ ]+'
# Contractions expanded to their two-word negated form (keys are lower case,
# so they must be applied to lower-cased text).
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# Whole-word match of any key in negations_dic.
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner(text):
    """Return a normalized, letters-only version of a raw tweet.

    Steps, in order: extract the plain text with BeautifulSoup, remove
    @mentions and URLs, lower-case, expand the contractions listed in
    ``negations_dic``, replace every non-letter with a space, and keep only
    tokens longer than one character.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    # The previous version wrapped `souped.decode("utf-8-sig")` in a bare
    # `except:`. On Python 3 a str has no `.decode`, so that call always raised
    # and the handler silently fell back to the undecoded text. The dead branch
    # is removed; the output is unchanged because the letters-only substitution
    # below already turns a BOM or U+FFFD into a space.
    stripped = re.sub(combined_pat, '', souped)   # @mentions and http(s) URLs
    stripped = re.sub(www_pat, '', stripped)      # www. URLs
    lower_case = stripped.lower()
    # Expand contractions while the apostrophes are still present.
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)  # every non-letter becomes a space
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]  # drop one-letter tokens
    return (" ".join(words)).strip()

In [22]:
# The full dataset holds 1,600,000 tweets and cleaning all of them takes a long
# time; lower the first argument of min() (e.g. to 50000) for a quicker run.
# The limit is clamped to the number of rows actually loaded so the loop can
# never index past the end of the frame.
limit = min(1600000, len(df))
import time
ms = time.time()  # start time, used for the progress printout
clean_tweet_texts = []  # cleaned tweets, in the same order as df
# Iterating over the column slice avoids one Series lookup per tweet; it is
# equivalent to df['text'][i] because the index was reset to 0..n-1.
for i, raw_text in enumerate(df['text'].iloc[:limit]):
    if i % 10000 == 0:
        print(i, time.time() - ms)  # rows done so far and seconds elapsed
    clean_tweet_texts.append(tweet_cleaner(raw_text))

# clean_tweet_texts

In [23]:
# Download the NLTK 'punkt' resource (presumably what word_tokenize in the
# next cell relies on — confirm against the installed NLTK version).
nltk.download('punkt')

Tokenize each cleaned tweet in clean_tweet_texts and append its list of tokens to word_tokens

In [24]:
# One token list per cleaned tweet, in the same order as clean_tweet_texts.
word_tokens = [word_tokenize(tweet) for tweet in clean_tweet_texts]

# Lemmatizing

In [25]:
# Download the NLTK 'wordnet' resource (presumably the data behind the
# WordNetLemmatizer used in the next cell — confirm).
nltk.download('wordnet')

In [26]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# df1 mirrors word_tokens: one list per tweet, each token replaced by its lemma.
df1 = [[lemmatizer.lemmatize(token) for token in tokens] for tokens in word_tokens]

# df

In [27]:
# Re-assemble every lemmatized token list into one space-separated string.
clean_df1 = [" ".join(tokens) for tokens in df1]

# clean_df1

Convert the lemmatized tweets (clean_df1) into a dataframe and name it clean_df

In [28]:
# Single-column frame holding the cleaned, lemmatized tweets.
clean_df = pd.DataFrame({'text': clean_df1})

In [29]:
# Character length of every cleaned tweet, for comparison with pre_clean_len.
clean_df['clean_len'] = clean_df['text'].map(len)

In [30]:
# First ten cleaned tweets that are still longer than 140 characters.
clean_df.loc[clean_df["clean_len"] > 140].head(10)

In [31]:
# Sentiment labels of the first `limit` tweets, i.e. exactly the rows that were
# cleaned. A positional slice equals the previous per-row df['sentiment'][i]
# lookups because the index was reset to 0..n-1 after shuffling, and it avoids
# a Python-level loop with one Series lookup per row.
target2 = df['sentiment'].iloc[:limit].tolist()
clean_df['target'] = target2
# Show the frame this cell modified (the cell used to display df, which it never touched).
clean_df.head()

In [32]:
from collections import Counter

# Features are the cleaned tweets; labels are the sentiment codes.
X = clean_df["text"]
y = clean_df["target"]
for series in (X, y):
    print(series.shape)
print(set(y))               # distinct label values
print(Counter(y).values())  # number of tweets per label

# perform train and test split

X_train holds the training tweets and X_test the test tweets whose sentiment we have to predict; y_train holds the sentiments of the training tweets and y_test the sentiments of the test tweets, which we use to measure the accuracy of the model.

In [33]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [34]:
# Map the sentiment codes to binary labels: 0 stays 0, every other value becomes 1.
y_train = [0 if label == 0 else 1 for label in y_train.tolist()]
y_test = [0 if label == 0 else 1 for label in y_test.tolist()]
# Plain Python lists of tweet strings.
X_train, X_test = X_train.tolist(), X_test.tolist()

In [35]:
import numpy as np
# Convert all four splits to NumPy arrays in one pass.
X_train, X_test, y_train, y_test = (
    np.array(part) for part in (X_train, X_test, y_train, y_test)
)

Create the TF-IDF vectorizer and store it as `vectorizer`. Features can be selected here: changing `ngram_range` changes which n-grams become features, and stop words can be removed through the `stop_words` parameter.

In [36]:
# TF-IDF features over word uni-, bi- and tri-grams.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 3))
# Learn the vocabulary and IDF weights from the training tweets only and
# vectorize them in the same pass; fit_transform returns the same matrix as
# fit() followed by transform() without tokenizing the training set twice.
X_train_tfidf = vectorizer.fit_transform(X_train)
# The test tweets are projected onto the vocabulary learned from the training set.
X_test_tfidf = vectorizer.transform(X_test)

# Naive Bayes Algorithm

In [37]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the TF-IDF matrix.
model_naive = MultinomialNB()
model_naive.fit(X=X_train_tfidf, y=y_train)

In [38]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the Naive Bayes model on the training split.
accuracies = cross_val_score(model_naive, X_train_tfidf, y_train, cv=10)
# Mean score across the ten folds.
accuracies.mean()

In [39]:
# Naive Bayes predictions for the held-out test tweets.
y_pred_ = model_naive.predict(X_test_tfidf) 

In [40]:
# Accuracy of the Naive Bayes predictions on the test split.
from sklearn import metrics
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_)

In [41]:
# Confusion matrix of the Naive Bayes predictions (true labels first, predictions second).
from sklearn.metrics import classification_report, confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_pred_)

In [42]:
# Classification report for the Naive Bayes predictions.
nb_report = classification_report(y_true=y_test, y_pred=y_pred_)
print(nb_report)

In [43]:

import pickle
# Persist the fitted TF-IDF vectorizer. The context manager closes (and
# flushes) the file even if pickling fails; the bare open() previously leaked
# the handle.
with open("tfidf1.pickle", "wb") as fh:
    pickle.dump(vectorizer, fh)

In [62]:
# Save the trained Naive Bayes model. The context manager closes (and flushes)
# the file even if pickling fails; the bare open() previously leaked the handle.
import pickle
with open("naive.pickle", "wb") as fh:
    pickle.dump(model_naive, fh)

# Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression
# Logistic regression using the lbfgs solver with a cap of 1000 iterations.
logisticRegression = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
)

In [46]:
# Train the logistic regression on the TF-IDF training matrix.
lg_model = logisticRegression.fit(X=X_train_tfidf, y=y_train)

In [48]:
# Logistic regression predictions for the held-out test tweets.
y_pred_lg = lg_model.predict(X_test_tfidf) 

In [49]:
# Confusion matrix of the logistic regression predictions against the true
# test labels. The cell previously compared them with the Naive Bayes
# predictions (y_pred_), which measures agreement between two models rather
# than correctness.
confusion_matrix(y_test, y_pred_lg)

In [50]:
# Classification report for logistic regression, scored against the true test
# labels (previously scored against the Naive Bayes predictions y_pred_).
print(classification_report(y_test, y_pred_lg))

In [61]:
# Save the trained logistic regression model. The context manager closes (and
# flushes) the file even if pickling fails; the bare open() previously leaked
# the handle.
with open("lg_model.pickle", "wb") as fh:
    pickle.dump(lg_model, fh)

# Support Vector Machine 

In [55]:
from sklearn.svm import LinearSVC # import the LinearSVC classifier from sklearn.svm
svm = LinearSVC(random_state=0) # LinearSVC instance with random_state fixed at 0

In [56]:
# Train the linear SVM on the TF-IDF training matrix.
svm_model = svm.fit(X=X_train_tfidf, y=y_train)

In [58]:
# Linear SVM predictions for the held-out test tweets.
y_pred_svm = svm_model.predict(X_test_tfidf)

In [59]:
# Confusion matrix of the SVM predictions against the true test labels. The
# cell previously compared them with the Naive Bayes predictions (y_pred_),
# which measures agreement between two models rather than correctness.
confusion_matrix(y_test, y_pred_svm)

In [60]:
# Classification report for the SVM, scored against the true test labels
# (previously scored against the Naive Bayes predictions y_pred_).
print(classification_report(y_test, y_pred_svm))

In [63]:
# Save the trained SVM model (the old comment wrongly said "Logistic
# Regression"). The context manager closes (and flushes) the file even if
# pickling fails; the bare open() previously leaked the handle.
with open("svm_model.pickle", "wb") as fh:
    pickle.dump(svm_model, fh)