In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import nltk
import matplotlib.pyplot as plt
import re
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Column labels supplied through `names=`, in the order given to read_csv.
cols = ['target', 'id', 'date', 'flag', 'user', 'text']
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data = pd.read_csv(
    "C:/Users/HP/Desktop/Sentiment_analysis/Sentiment_analysis.csv",
    names=cols,
    encoding='ISO-8859-1',
)
data.head()

In [None]:
# Total of 1.6M tweets expected; shape shows (rows, columns) of the frame just loaded.
data.shape

In [None]:
# Show the column labels currently on the frame (assigned via `names=cols` at load time).
data.columns

In [None]:
# Remove the metadata columns; only the label and the tweet text are kept.
data = data.drop(columns=['id', 'date', 'flag', 'user'])
data

In [None]:
# Put the tweet text first and the label second.
cols = ['text', 'target']
data = data.reindex(cols, axis='columns')
data

In [None]:
# Raw label encoding: 0 = Negative, 4 = Positive.
# List the distinct label values present in the data.
data['target'].unique()

In [None]:
# Recode the positive label from 4 to 1, then list the distinct labels again.
data['target'] = data['target'].replace({4: 1})
data['target'].unique()

In [None]:
# Checking for missing values: reduce the element-wise null mask of the frame with np.any.
np.any(data.isnull())

In [None]:
# Total = 1.6M tweets expected; (rows, columns) before duplicate removal.
data.shape

In [None]:
# Drop rows whose tweet text repeats an earlier row; the first occurrence is kept.
data = data.drop_duplicates(subset='text', keep='first')

In [None]:
# Number of tweets (rows) remaining after dropping duplicate texts.
data.shape

In [None]:
# Plotting the distribution for dataset: number of tweets per sentiment label.
# The counts are taken per target value (sorted so the lower label comes first),
# which yields one bar per class. Counting the frame directly would instead give
# one bar per *column* ('text', 'target'), which the Negative/Positive tick
# labels would then misdescribe.
class_counts = data['target'].value_counts().sort_index()
ax = class_counts.plot(kind='bar', title='Distribution of data', legend=False)
ax.set_xticklabels(['Negative', 'Positive'], rotation=0)
# Storing data in lists.
text, sentiment = list(data['text']), list(data['target'])

In [None]:
# Lookup table: text emoticon -> word describing it.
# Insertion order is kept identical to the original literal.
emojis = {
    ':)': 'smile',
    ':-)': 'smile',
    ';d': 'wink',
    ':-E': 'vampire',
    ':(': 'sad',
    ':-(': 'sad',
    ':-<': 'sad',
    ':P': 'raspberry',
    ':O': 'surprised',
    ':-@': 'shocked',
    ':@': 'shocked',
    ':-$': 'confused',
    ':\\': 'annoyed',
    ':#': 'mute',
    ':X': 'mute',
    ':^)': 'smile',
    ':-&': 'confused',
    '$_$': 'greedy',
    '@@': 'eyeroll',
    ':-!': 'confused',
    ':-D': 'smile',
    ':-0': 'yell',
    'O.o': 'confused',
    '<(-_-)>': 'robot',
    'd[-_-]b': 'dj',
    ":'-)": 'sadsmile',
    ';)': 'wink',
    ';-)': 'wink',
    'O:-)': 'angel',
    'O*-)': 'angel',
    '(:-D': 'gossip',
    '=^.^=': 'cat',
}

In [None]:
# Replace every emoticon occurring inside a tweet with the token "EMOJI<meaning>".
# `Series.replace` only swaps cells whose *entire* value equals the key, so
# emoticons embedded in a sentence were never replaced. `.str.replace` with
# regex=False does a literal substring replacement (the keys contain regex
# metacharacters such as ')' and '$', so they must not be treated as patterns).
# Longer emoticons are handled first so that e.g. 'O:-)' is not partially
# consumed by the shorter ':-)'.
for emoji in sorted(emojis, key=len, reverse=True):
    data['text'] = data['text'].str.replace(emoji, "EMOJI" + emojis[emoji], regex=False)

In [None]:
# Hand-written English stop-word list.
# Contractions are stored without apostrophes (e.g. "youre", "shouldve") and
# their split fragments are listed too ('ll', 've', 't', 's', ...).
stopwords =  ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an','and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do','does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here','hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma','me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them','themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was', 'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre","youve", 'your', 'yours', 'yourself', 'yourselves']

In [None]:
# Merge scikit-learn's built-in English stop words with the custom `stopwords` list.
# The list object itself is passed to union(); union(["stopwords"]) would only
# add the literal word "stopwords" and silently ignore the custom list.
# The result is materialised as a sorted list because a list is the collection
# type documented for the vectorizer's `stop_words` parameter.
my_stop_words = sorted(ENGLISH_STOP_WORDS.union(stopwords))

In [None]:
#Pre-processing data
# Lower-case every tweet, then preview the first rows.
data['text']=data['text'].str.lower()
data['text'].head()

In [None]:
def cleaning_repeating_char(text):
    """Squeeze runs of three or more identical characters down to two.

    Example: "sooooo goood" -> "soo good".

    The earlier pattern had lost its backslashes (``(.)1+`` replaced by ``1``),
    so it matched "any character followed by the digit 1" instead of a
    back-reference to the captured character. Runs are capped at two rather
    than one so legitimate double letters ("good", "happy") stay intact.

    NOTE(review): "www" is also shortened to "ww", so any URL stripping that
    looks for "www." needs to run before this step to see the prefix.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every run of 3+ identical characters reduced to 2.
    """
    return re.sub(r'(.)\1{2,}', r'\1\1', text)
# Run the repeated-character clean-up over every tweet and preview the result.
data['text'] = data['text'].apply(cleaning_repeating_char)
data['text'].head()

In [None]:
def cleaning_username(data):
    """Replace every @mention with a single space.

    A mention is an "@" followed by one or more non-whitespace characters.

    Args:
        data: The tweet text to clean.

    Returns:
        The text with each mention replaced by one space.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'@\S+', ' ', data)
# Strip @mentions from every tweet and preview the result.
data['text'] = data['text'].apply(cleaning_username)
data['text'].head()

In [None]:
def cleaning_URLs(data):
    """Replace URLs in ``data`` with a single space.

    Two forms are recognised:
      * anything starting with ``http://`` or ``https://``
      * anything starting with ``www.`` at the beginning of the text or
        directly after whitespace (that whitespace is consumed as well)

    A URL ends at the next whitespace character of any kind, not only at a
    literal space, so tabs and newlines terminate it too.

    Args:
        data: The tweet text to clean.

    Returns:
        The text with each URL replaced by one space.
    """
    # Raw string so that "\." and "\S" reach the regex engine unchanged.
    return re.sub(r'https?://\S*|(?:^|\s)www\.\S*', ' ', data)
# Strip URLs from every tweet and preview the result.
data['text'] = data['text'].apply(cleaning_URLs)
data['text'].head()

In [None]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
# Deletion table built once at definition time; rebuilding it on every call is
# wasted work when the function is applied row by row to a large frame.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)

def cleaning_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all ASCII punctuation characters removed (nothing is
        inserted in their place).
    """
    return text.translate(_PUNCTUATION_TABLE)
# Remove punctuation from every tweet and preview the result.
data['text'] = data['text'].apply(cleaning_punctuations)
data['text'].head()

In [None]:
def cleaning_numbers(data):
    """Replace each run of ASCII digits in ``data`` with a single space.

    Args:
        data: The text to clean.

    Returns:
        The text with every maximal digit run swapped for one space.
    """
    digit_run = re.compile('[0-9]+')
    return digit_run.sub(' ', data)
# Blank out digit runs in every tweet and preview the result.
data['text'] = data['text'].apply(cleaning_numbers)
data['text'].head()

In [None]:
# Split each tweet into tokens matching \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].map(tokenizer.tokenize)
data['text'].head()

In [None]:
# Preview the first ten rows; 'text' now holds a token list per tweet.
data.head(10)

In [None]:
# Re-assemble every token list into one space-separated string.
data['text'] = data['text'].apply(" ".join)
data['text'].values

In [None]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    """Lemmatize every word of a tweet with the WordNet lemmatizer.

    Previously the lemmatized words were computed and then thrown away (the
    input was returned untouched), and for string input the loop walked over
    single characters rather than words.

    Args:
        data: Either a whitespace-separated string or an iterable of tokens.

    Returns:
        For string input, a string of the lemmatized words joined by single
        spaces. For any other iterable, a list of lemmatized tokens.
    """
    if isinstance(data, str):
        # Split on whitespace so whole words, not characters, are lemmatized.
        return " ".join(lm.lemmatize(word) for word in data.split())
    return [lm.lemmatize(word) for word in data]
# Run the lemmatization step over every tweet and preview the result.
data['text'] = data['text'].apply(lemmatizer_on_text)
data['text'].head()

In [None]:
# Feature array (cleaned tweet text) and label array (0/1 target).
X, y = data['text'].values, data['target'].values

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=53,
)
X_train, X_test, y_train, y_test

In [None]:
# TF-IDF over unigrams and bigrams, vocabulary capped at 500,000 features,
# with the combined stop-word collection filtered out.
vect = TfidfVectorizer(
    stop_words=my_stop_words,
    ngram_range=(1, 2),
    max_features=500000,
)

In [None]:
# NOTE(review): this repeats the earlier X / y assignment verbatim. The cells
# below work on X_train / X_test from the split, so this cell looks redundant —
# confirm and consider removing it.
X=data['text'].values
y=data['target'].values

In [None]:
# Learn the TF-IDF vocabulary on the training text only, then apply that same
# fitted vectorizer to the test text.
# NOTE(review): X_train / X_test are rebound from raw text to the vectorizer
# output, so re-running this cell without re-running the split feeds already
# transformed data back into the vectorizer.
X_train = vect.fit_transform(X_train)
X_test  = vect.transform(X_test)

In [None]:
# Logistic regression with C=2, up to 1000 solver iterations, and n_jobs=-1,
# fitted on the TF-IDF training matrix.
LRmodel = LogisticRegression(C=2,max_iter = 1000, n_jobs=-1)
LRmodel.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out test matrix.
y_pred = LRmodel.predict(X_test)

In [None]:
# Confusion matrix of the fitted model on the test split, labelled Negative / Positive.
# NOTE(review): plot_confusion_matrix has been removed from recent scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator — confirm against
# the installed version.
plot_confusion_matrix(LRmodel,X_test,y_test,display_labels=["Negative","Positive"])

In [None]:
# Share of test tweets whose predicted label equals the true label.
# Arguments follow the documented (y_true, y_pred) order. Accuracy itself is
# symmetric, so the value is unchanged, but the conventional order prevents
# mistakes if this call is later swapped for an asymmetric metric.
accuracy = accuracy_score(y_test, y_pred)
accuracy