# Coronavirus tweets NLP_CLASSIFICATION

The objective of this task is to detect sentiments in tweets. For the sake of simplicity, we say each tweet carries one of five sentiment labels: Extremely Negative, Negative, Neutral, Positive, or Extremely Positive. So, the task is to classify the tweets according to these labels.

### STEPS INVOLVED

#### 1.Importing dependencies
#### 2.Exploratory data Analysis (Text and Sentiment)
#### 3.Preprocessing and cleaning text data
#### 4.Tokenisation and  lemmatisation 
#### 5.Feature Extraction
#### 6.Dividing data into training and test data sets
#### 7.Model Building
#### 8.Evaluation of Model
#### 9.Model Testing System

## IMPORTING DEPENDENCIES

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
import nltk
import string
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score
from sklearn.svm import SVC
from wordcloud import WordCloud
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Changing the working directory so the relative CSV/pickle paths below resolve.
# NOTE(review): this is a hard-coded Windows path; adjust it for your machine.
os.chdir("D:\\NLP")

In [3]:
# Importing the first csv file (read as latin1) and previewing its first rows
df1=pd.read_csv("Corona_NLP1.csv",encoding='latin1')
df1.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
# Importing the second csv file (read as latin1) and previewing its first rows
df2=pd.read_csv("Corona_NLP2.csv",encoding='latin1')
df2.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [5]:
# (rows, columns) of each input file, one per line
print(df1.shape, df2.shape, sep="\n")

(41157, 6)
(3798, 6)


In [6]:
# Merging the two csv files into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of df2 would repeat labels already used by df1.
df = pd.concat([df1, df2], ignore_index=True)
print(df.shape)
df.head()

(44955, 6)


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
# Information: column names, non-null counts and dtypes of the merged frame
df.info()

In [None]:
# UserName, ScreenName, Location and TweetAt are not used for classification,
# so they are removed from the frame in place.
df.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1, inplace=True)
df.head()

In [None]:
# Checking Null values in data (count of missing entries per column)
df.isnull().sum()

## EXPLORATORY DATA ANALYSIS

In [None]:
# Bar chart of how many tweets carry each Sentiment label
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='Sentiment')
plt.show()

In [None]:
# Pie chart of the share of each Sentiment label (whole-number percentages)
plt.figure(figsize=(8, 8))
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts.plot(kind='pie', autopct='%1.0f%%')
plt.show()

In [None]:
# Length (in characters) of each text in the OriginalTweet column
df['OriginalTweet'].str.len()

In [None]:
# Characters per tweet, not counting spaces
count_lettr = df['OriginalTweet'].str.len().sub(df['OriginalTweet'].str.count(' '))
count_lettr.describe()

In [None]:
# Distribution of the length of the OriginalTweet column (spaces included)
df['OriginalTweet'].str.len().hist()

In [None]:
# Distribution of the length of the OriginalTweet column (spaces excluded)
count_lettr.hist()

## PREPROCESSING AND CLEANING TEXT DATA

In [None]:
# removes pattern in the input text
def remove_pattern(input_txt, pattern):
    """Return `input_txt` with every match of the regex `pattern` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    The substitution uses `pattern` itself. Re-using each matched substring
    as a new, unescaped regex would delete the prefix of longer matches
    (e.g. "@a" inside "@ab") and would misbehave or raise when a match
    contains regex metacharacters.
    """
    return re.sub(pattern, "", input_txt)

# removing the URL
def remove_URL(headline_text):
    """Return `headline_text` with http(s):// and www. links deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', headline_text)

# removing the punctuations
def remove_punctuations(text):
    """Return `text` with every character of string.punctuation turned into a space."""
    to_space = str.maketrans({mark: " " for mark in string.punctuation})
    return text.translate(to_space)

# removing non-ASCII characters
def encoded(data):
    """Return `data` with every character that cannot be encoded as ASCII dropped."""
    return data.encode("ascii", "ignore").decode()

# removing irrelevant characters
def reg(data):
    """Return `data` with carriage returns and line feeds deleted (not replaced by spaces)."""
    return re.sub(r'[\r\n\r\n]', '', data)

#removing multi spaces
def spaces(data):
    """Return `data` with each run of spaces collapsed to a single space."""
    return re.sub(' +', ' ', data)

# removing emojis
def remove_emojis(data):
    """Return `data` with every character in the code-point ranges below deleted."""
    # Members of one regex character class, concatenated in this order.
    members = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",  # very wide range: also covers most of the BMP above U+24C2
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # every supplementary-plane character
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",
        "\u3030",
    )
    emoji_class = re.compile("[" + "".join(members) + "]+", re.UNICODE)
    return emoji_class.sub('', data)

# Build the cleaned-text column step by step; every step works on the output of the previous one.
df['clean_t'] = df['OriginalTweet'].apply(remove_pattern, args=(r'@[\w]*',))  # strip @mentions

df['clean_t'] = df['clean_t'].apply(remove_URL)
df['clean_t'] = df['clean_t'].apply(remove_punctuations)
df['clean_t'] = df['clean_t'].apply(encoded)
# regex=True is required: without it newer pandas treats the pattern as a
# literal string and this step silently does nothing.
df['clean_t'] = df['clean_t'].str.replace("[^a-zA-Z]", " ", regex=True)  # removing the numeric and other non-letter characters
df['clean_t'] = df['clean_t'].str.lower()                                # to convert into lower case
df['clean_t'] = df['clean_t'].apply(reg)
df['clean_t'] = df['clean_t'].apply(spaces)
df['clean_t'] = df['clean_t'].apply(remove_emojis)

df.head(10)

In [None]:
# Removing short words: only words longer than 3 characters are kept
df['clean_t'] = df['clean_t'].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))
df.head(10)

In [None]:
# Removing irrelevant words in clean_t column
words = set(nltk.corpus.words.words())


def clean_sent(sent):
    """Re-join the tokens of `sent`, keeping those that are in `words` or are not purely alphabetic."""
    kept = []
    for token in nltk.wordpunct_tokenize(sent):
        if not token.isalpha() or token.lower() in words:
            kept.append(token)
    return " ".join(kept)


df['clean_t'] = df['clean_t'].apply(clean_sent)

In [None]:
# Preview the frame after the word-list filter
df.head(10)

# Tokenisation

In [None]:
# Splitting each cleaned tweet into a list of tokens
from nltk.tokenize import word_tokenize
df['clean_t'] = df['clean_t'].apply(nltk.word_tokenize)

In [None]:
# English stop words from NLTK, held in a set for fast membership tests
from nltk.corpus import stopwords
s = set(stopwords.words('english'))


def remove_stopwords(data):
    """Return the tokens of `data` that are not in the stop-word set `s`."""
    return [token for token in data if token not in s]


df['clean_t'] = df['clean_t'].apply(remove_stopwords)
df['clean_t'].head(10)

In [None]:
'''# Stemming
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
df['clean_t']= df['clean_t'].apply(lambda x: [stemmer.stem(word) for word in x])
df.head()'''

In [None]:
# Lemmatizing every token of every tweet with WordNet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['clean_t'] = df['clean_t'].apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))
df.head(10)

In [None]:
# Inspect the first 50 processed token lists
df['clean_t'].head(50)

In [None]:
# Folding the two "Extremely ..." labels into plain Positive / Negative
df['Sentiment'] = df['Sentiment'].replace(
    {"Extremely Positive": "Positive", "Extremely Negative": "Negative"}
)
df.head()

# WORD CLOUD

In [None]:

# Flatten every token list into one space-separated string. str(Series) must
# not be used here: it is only pandas' truncated preview (a few rows plus the
# Name/dtype footer), so the cloud would not reflect the full corpus.
all_text = " ".join(word for tokens in df['clean_t'] for word in tokens)
wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100,background_color='black').generate(all_text)
# plot the graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Distinct labels remaining in the Sentiment column
df['Sentiment'].unique()

In [None]:
# One sub-frame per sentiment label, used for the per-sentiment word clouds
df_neutral = df.loc[df['Sentiment'] == "Neutral"]
df_Positve = df.loc[df['Sentiment'] == "Positive"]
df_Negative = df.loc[df['Sentiment'] == 'Negative']

In [None]:
from wordcloud import WordCloud
# Join all tokens of the neutral tweets; str(Series) would only give pandas'
# truncated preview instead of the full text.
neutral_text = " ".join(word for tokens in df_neutral['clean_t'] for word in tokens)
wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(neutral_text)
# plot the graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud
# Join all tokens of the positive tweets; str(Series) would only give pandas'
# truncated preview instead of the full text.
positive_text = " ".join(word for tokens in df_Positve['clean_t'] for word in tokens)
wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(positive_text)
# plot the graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud
# Join all tokens of the negative tweets; str(Series) would only give pandas'
# truncated preview instead of the full text.
negative_text = " ".join(word for tokens in df_Negative['clean_t'] for word in tokens)
wordcloud = WordCloud(width=800, height=500,background_color='white', random_state=42, max_font_size=100).generate(negative_text)
# plot the graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Bar chart of the Sentiment labels after the "Extremely ..." classes were merged
plt.figure(figsize=(5, 5))
sns.countplot(data=df, x='Sentiment')
plt.show()

In [None]:
# Converting Categorical values into Numerical Values.
# The mapping is named sentiment_map so that the builtin `dict` is not shadowed
# for the rest of the session.
sentiment_map = {'Positive' : 1, 'Negative' : -1 , 'Neutral' : 0}

# Print the mapping
print(sentiment_map)

# Remap the values of the dataframe
df.replace({"Sentiment": sentiment_map},inplace=True)

## Dividing Data into Training and Test Sets

In [None]:
# Features: processed token lists; target: numeric sentiment label
x=df['clean_t']
y=df['Sentiment']

In [None]:
# 70/30 split with a fixed seed; stratify=y keeps the label proportions the same in both parts
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=42, test_size=0.3,stratify=y)

In [None]:
# Shapes of the four split parts, one per line
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

In [None]:
# Wrap both feature Series in single-column DataFrames
x_train, x_test = pd.DataFrame(x_train), pd.DataFrame(x_test)
type(x_test)

# Feature extraction

In [None]:
# Using Bag of Words Technique we are going to convert text into numeric form
from sklearn.feature_extraction.text import CountVectorizer
# lowercase=False: the text was already lower-cased during cleaning
bow_vec = CountVectorizer(lowercase=False)
# Each token list becomes its string representation, which the vectorizer then re-tokenises
x_train['clean_t']=x_train['clean_t'].apply(str)
# Learn the vocabulary from the training tweets only and build their count matrix
bow1 = bow_vec.fit_transform(x_train['clean_t'])

In [None]:
# Same string conversion for the test tweets, then count them with the vocabulary fitted on the training set
x_test['clean_t']=x_test['clean_t'].apply(str)
bow2=bow_vec.transform(x_test['clean_t'])

In [None]:
# Replace x_test with a dense DataFrame of the test count matrix
x_test=pd.DataFrame(bow2.toarray())
x_test

In [None]:
# Replace x_train with a dense DataFrame of the training count matrix
x_train=pd.DataFrame(bow1.toarray())
x_train

# Model Building

We build models using the Bag of Words and TF-IDF techniques.
Four models are planned; after computing the accuracy of each one, we select the best-performing model:
1.Random Forest Classifier
2.Decision Tree Classifier
3.Logistic Regression
4.Naive Bayes (listed here, but not built in this notebook)

## Model Building Using Bag of Words Technique

### 1.Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest (and its reported scores) can be reproduced,
# matching the random_state=42 already used for the train/test split.
model1 = RandomForestClassifier(random_state=42)
model1.fit(x_train, y_train)

### 2.Decision Tree Classifier

In [None]:
# Fixed seed so the tree (and its reported scores) can be reproduced,
# matching the random_state=42 already used for the train/test split.
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(x_train, y_train)

### 3.Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings.
# NOTE(review): warnings are silenced at the top of the notebook, so a
# convergence warning would not be visible here — confirm the solver converges.
model3=LogisticRegression( )
model3.fit(x_train, y_train)

## Model Testing

In [None]:
# Predict the test set with each of the three fitted models
pred1, pred2, pred3 = (
    fitted.predict(x_test) for fitted in (model1, model2, model3)
)

In [None]:
# Model Evaluation: one classification report per model, each under its banner
reports = (
    ("---------------------------Model 1(RFC)--------------------", pred1),
    ("---------------------------Model 2(DTC)--------------------", pred2),
    ("---------------------------Model 3(LR)--------------------", pred3),
)
for banner, predictions in reports:
    print(banner)
    print(classification_report(y_test, predictions))

We can see that model3, i.e. Logistic Regression, has performed well.

## Model Building Using TF-IDF Technique

In [None]:
'''from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(lowercase=False)
df['clean_t']=df['clean_t'].apply(str)
tfid = vectorizer.fit_transform(df['clean_t'])
tfid'''

### Dividing Data into Training and Test data sets

In [None]:
'''x_train1, x_test1, y_train1, y_test1 = train_test_split(tfid,df['Sentiment'],random_state=42, test_size=0.3,stratify=df['Sentiment'])'''

## Random Forest Classifier

In [None]:
'''model4=RandomForestClassifier()
model4.fit(x_train1, y_train1)'''

## Decision Tree Classifier

In [None]:
'''model5= DecisionTreeClassifier()
model5.fit(x_train1, y_train1)'''

## Logistic Regression

In [None]:
'''model6=LogisticRegression()
model6.fit(x_train1, y_train1)'''

In [None]:
'''pred4= model4.predict(x_test1)
pred5= model5.predict(x_test1)
pred6= model6.predict(x_test1)'''

In [None]:
'''print("---------------------------Model 1(RFC)--------------------")
print(classification_report(y_test1, pred4))
print("---------------------------Model 2(DTC)--------------------")
print(classification_report(y_test1, pred5))
print("---------------------------Model 3(LR)--------------------")
print(classification_report(y_test1, pred6))'''

# Model Testing

In [None]:
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test, pred3)
# Do not store the score in `s`: that name holds the stop-word set that
# remove_stopwords() reads, and overwriting it with a float makes the later
# predict() call fail with a TypeError.
accuracy = accuracy_score(y_test, pred3)
print("Accuracy Score of model3 by using Bag of words", round(accuracy*100, 2))

## Saving the Model and vectorizer

In [None]:
import pickle
# The with-block closes (and flushes) the file even if pickling fails
with open('model.pkl', "wb") as model_file:
    pickle.dump(model3, model_file)
#model=pickle.load(open('model.pkl',"rb"))

In [None]:
# Save the fitted vectorizer; the with-block closes the file afterwards
with open('bow.pkl', "wb") as vectorizer_file:
    pickle.dump(bow_vec, vectorizer_file)

In [None]:
# Defining Function
# Sample tweet; NOTE(review): this list is not read by any code visible in this notebook
data = ["For corona prevention,we should stop to buy things with the cash and should use online payment methods because corona can spread through the notes. Also we should prefer online shopping from our home. It's time to fight against COVID 19?. #govindia #IndiaFightsCorona"]
def remove_URL(headline_text):
    """Return `headline_text` with http(s):// and www. links deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', headline_text)

def remove_pattern(input_txt, pattern):
    """Return `input_txt` with every match of the regex `pattern` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    The substitution uses `pattern` itself. Re-using each matched substring
    as a new, unescaped regex would delete the prefix of longer matches
    (e.g. "@a" inside "@ab") and would misbehave or raise when a match
    contains regex metacharacters.
    """
    return re.sub(pattern, "", input_txt)

# removing the punctuations
def remove_punctuations(text):
    """Return `text` with every character of string.punctuation turned into a space."""
    to_space = str.maketrans({mark: " " for mark in string.punctuation})
    return text.translate(to_space)

# removing non-ASCII characters
def encoded(data):
    """Return `data` with every character that cannot be encoded as ASCII dropped."""
    return data.encode("ascii", "ignore").decode()

# removing irrelevant characters
def reg(data):
    """Return `data` with carriage returns and line feeds deleted (not replaced by spaces)."""
    return re.sub(r'[\r\n\r\n]', '', data)

#removing multi spaces
def spaces(data):
    """Return `data` with each run of spaces collapsed to a single space."""
    return re.sub(' +', ' ', data)

# removing emojis
def remove_emojis(data):
    """Return `data` with every character in the code-point ranges below deleted."""
    # Members of one regex character class, concatenated in this order.
    members = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",  # very wide range: also covers most of the BMP above U+24C2
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # every supplementary-plane character
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",
        "\u3030",
    )
    emoji_class = re.compile("[" + "".join(members) + "]+", re.UNICODE)
    return emoji_class.sub('', data)

# Vocabulary used to filter out tokens that are not English dictionary words
words = set(nltk.corpus.words.words())


def clean_sent(sent):
    """Re-join the tokens of `sent`, keeping those that are in `words` or are not purely alphabetic."""
    kept = []
    for token in nltk.wordpunct_tokenize(sent):
        if not token.isalpha() or token.lower() in words:
            kept.append(token)
    return " ".join(kept)

# Removing Stopwords
# Own copy of the English stop words, so this function does not depend on the
# one-letter global `s`, which an earlier cell reuses for the accuracy score.
_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))


def remove_stopwords(data):
    """Return the tokens of `data` that are not English stop words.

    Parameters
    ----------
    data : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The tokens in their original order, stop words removed.
    """
    return [w for w in data if w not in _STOP_WORDS]


def predict(input_text):
    """Clean one raw tweet like the training data and print its predicted sentiment.

    Parameters
    ----------
    input_text : str
        Raw tweet text.

    Prints "Neutral" for a prediction of 0, "Negative" for -1 and "Positive"
    otherwise. Returns None.
    """
    # Text clean-up, in the same order as the training column was built.
    text = remove_pattern(input_text, r'@[\w]*')    # strip @mentions
    text = remove_URL(text)
    text = remove_punctuations(text)
    text = encoded(text)                            # drop non-ASCII, as done for the training column
    # Plain re.sub: the pandas .str.replace call needs regex=True to treat this
    # as a pattern, and a DataFrame is not needed for a single string anyway.
    text = re.sub(r"[^a-zA-Z]", " ", text)          # removing the numeric characters
    text = text.lower()                             # to convert into lower case
    text = reg(text)
    text = spaces(text)
    text = remove_emojis(text)
    text = " ".join(w for w in text.split() if len(w) > 3)
    text = clean_sent(text)

    # Token-level steps.
    tokens = nltk.word_tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # The vectorizer was fitted on str(token_list), so pass the same representation.
    features = bow_vec.transform([str(tokens)])
    final = pd.DataFrame(features.toarray())

    # predict() returns an array with one entry; compare the scalar, not the array.
    my_prediction = model3.predict(final)[0]
    if my_prediction == 0:
        print("Neutral")
    elif my_prediction == -1:
        print("Negative")
    else:
        print("Positive")
     

In [None]:
# Try the prediction function on a sample tweet
predict("For corona prevention,we should stop to buy things with the cash and should use online payment methods because corona can spread through the notes. Also we should prefer online shopping from our home. It's time to fight against COVID 19?. #govindia #IndiaFightsCorona")