<a href="https://colab.research.google.com/github/AnuAgni/Natural-Language-Processing/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TWEET SENTIMENT ANALYSIS 
(TAKEN FROM: https://www.nbshare.io/notebook/754493525/Tweet-Sentiment-Analysis-Using-LSTM-With-PyTorch/)
Difference: Instead of undersampling text augmentation has been done.

##Import and Data loading

In [None]:
import torch                                              # Open source library and script language
import torch.nn as nn                                     # Provides all building blocks to build neural network
import torch.nn.functional as F                           # Contains varrious functions applied on input layer/signal
from torch.utils.data import DataLoader, TensorDataset    

import numpy as np                                        # Work with mathematical data
import pandas as pd                                       # Work with tabular data

import re                                                 # Provides regular expression matching operations
# Regular expression also known as regex(p) is a method used for pattern/string matching

from sklearn.model_selection import train_test_split      # Model_selection is a method for setting a blueprint to analyze data and then using it to measure new data. 
from sklearn.metrics import accuracy_score                #how accurate the model is

import nltk                                               # Work with human language data for applying in statistcial nlp
from nltk.tokenize import word_tokenize                   # Breaking text into small chunks , slipt workd into tokens

import matplotlib.pyplot as plt                            # Use for plotting i.e data visualisation

In [None]:
nltk.download('punkt')                                     # Tokenize sentence

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
train_df=pd.read_csv('https://raw.githubusercontent.com/ajayshewale/Sentiment-Analysis-of-Text-Data-Tweets-/master/data/train.csv')   # Reading raw train data from github

In [None]:
test_df=pd.read_csv('https://raw.githubusercontent.com/ajayshewale/Sentiment-Analysis-of-Text-Data-Tweets-/master/data/test.csv')     # Reading raw test data from github

##Data preprocessing

In [None]:
train_df    # Checking train data manually by printing on screen

Unnamed: 0,Id,Category,Tweet
0,635769805279248384,negative,Not Available
1,635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...
2,635950258682523648,neutral,"Mar if you have an iOS device, you should down..."
3,636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...
4,636100906224848896,positive,Not sure how to start your publication on iOS?...
...,...,...,...
5965,639016598477651968,neutral,@YouAreMyArsenal Wouldn't surprise me if we en...
5966,640276909633486849,neutral,Rib injury for Zlatan against Russia is a big ...
5967,640296841725235200,neutral,Noooooo! I was hoping to see Zlatan being Zlat...
5968,641017384908779520,neutral,Not Available


In [None]:
# Clean code from inconsistent, noisy and useless information
train_df=train_df.drop(columns=["Id"])                      # Id is useless for training model
train_df=train_df.dropna()                                  # Droping null value
train_df=train_df[train_df['Tweet']!="Not Available"]       # Not available will not help in training model
train_df                                                    # Printing training data

Unnamed: 0,Category,Tweet
1,neutral,IOS 9 App Transport Security. Mm need to check...
2,neutral,"Mar if you have an iOS device, you should down..."
3,negative,@jimmie_vanagon my phone does not run on lates...
4,positive,Not sure how to start your publication on iOS?...
5,neutral,"Two Dollar Tuesday is here with Forklift 2, Qu..."
...,...,...
5963,positive,"Ok ed let's do this, Zlatan, greizmann and Lap..."
5964,neutral,Goal level: Zlatan 90k by Friday? = Posting e...
5965,neutral,@YouAreMyArsenal Wouldn't surprise me if we en...
5966,neutral,Rib injury for Zlatan against Russia is a big ...


In [None]:
test_df     # Checking testing data manually

Unnamed: 0,Id,Category
0,6.289494e+17,dear @Microsoft the newOoffice for Mac is grea...
1,6.289766e+17,@Microsoft how about you make a system that do...
2,6.290232e+17,Not Available
3,6.291792e+17,Not Available
4,6.291863e+17,If I make a game as a #windows10 Universal App...
...,...,...
9963,,
9964,,
9965,,
9966,,


In [None]:
test_df=test_df.rename(columns={"Category":"Tweets"})   # Renaming 'Category' as 'Tweets'

In [None]:
# Cleaning test data
test_df=test_df.drop(columns=["Id"])                  # Id irrelevant for testing model
test_df=test_df.dropna()                              # Dropping null values
test_df=test_df[test_df['Tweets']!='Not Available']   # Removing 'Not Available' tweets
test_df                                               # Printing modified test data

Unnamed: 0,Tweets
0,dear @Microsoft the newOoffice for Mac is grea...
1,@Microsoft how about you make a system that do...
4,If I make a game as a #windows10 Universal App...
5,"Microsoft, I may not prefer your gaming branch..."
6,@MikeWolf1980 @Microsoft I will be downgrading...
...,...
3994,Anybody with a Steak &amp; Shake or IHOP move ...
3995,I am assembling an epic Pancake Posse for an I...
3996,do you work at Ihop tomorrow @carlysunshine_
3997,23 Aug 00;30 #771NAS Rescue193 returned from T...


In [None]:
train_clean_df,test_clean_df=train_test_split(train_df,test_size=0.15)     # Splitting the data into two only possible when not using label, train size = 0.85rain

In [None]:
# Using list for further cleaning, pandas can be used too
train_set=list(train_clean_df.to_records(index=False))      # Changes df to list and adds to train_set
test_set=list(test_clean_df.to_records(index=False))        # Changes df to list and adds to test_set

In [None]:
def remove_links_mentions(tweet):                           # Function to remove mentions and links 
  link_re_pattern="https?:\/\/t.co[\w]+"                    # defining pattern for link
  link_re_pattern_second="http?:\/\/t.co[\w]+"
  mention_re_pattern = "@\w+"                               # Defining pattern for mentions
  tweet = re.sub(link_re_pattern, " ", tweet)               # Using regex to replace link with space
  tweet = re.sub(mention_re_pattern," ", tweet)             # Using regex to replace mention with space
  return tweet.lower()                                      # Returns the tweet in lower case

In [None]:
train_set = [(label ,remove_links_mentions(tweet)) for label,tweet in train_set]

In [None]:
train_set_df = pd.DataFrame(train_set, columns=['Category','Tweet'])

In [None]:
test_set = [(label ,remove_links_mentions(tweet)) for label,tweet in test_set]

In [None]:
test_set_df = pd.DataFrame(test_set, columns=['Category','Tweet'])

In [None]:
train_df=train_set_df.append(test_set_df,ignore_index=True)     

###Class imbalance
Here we need to make sure that each category has equal number of instances so that model is trained well (Classification task)

In [None]:
train_df['Category'].value_counts()   # Counting to check for class imbalance

positive    2599
neutral     1953
negative     869
Tweet          1
Name: Category, dtype: int64

Solutions to fix imbalance
1) Oversampling
2) Undersampling
3) Hybrid approach
4) Augmentation

In [None]:
train_df[train_df['Category']=='negative']

Unnamed: 0,Category,Tweet
29,negative,"trump, cruz to rally against iran deal: on wed..."
30,negative,it was a disrespectful travestys for potus to ...
34,negative,i think on some day's may the spirt of kurt co...
38,negative,1st day back at work after a terrible week off...
41,negative,"ukip boy may have deleted his twitter profile,..."
...,...,...
5404,negative,"i understand why people may support them, bu..."
5410,negative,dear ppl who attribute vocal fry trend to brit...
5411,negative,if you violate a restraining order and have to...
5415,negative,when sarah palin and donald trump speak for am...


In [None]:
train_df['Tweet'].loc[3]

"netflix set to make new series of charlie brooker's dystopian drama black mirror | media | the guardian http://t.co/llw1xqniwh"

In [None]:
pip install nlpaug      # Installing for text augment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
nltk.download('averaged_perceptron_tagger')     # Contains pre trained POS tagger. Tagger labels the word into different categories such as noun, adjective etc.

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
nltk.download('wordnet')    # Used for synonyms

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import nlpaug
import nlpaug.augmenter.word as naw   # Augment words

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_df['Tweet'],train_df['Category'])   # Data split into lists to easily perform operation 

In [None]:
aug=naw.SynonymAug(aug_src='wordnet',aug_max=3)     # Synonyms for words are used to make different sentences, here max maximum synonyms for a word used are 3, wordent is used to accecc synonyms

In [None]:
augmented_sentences=[]                              # List to save augmented sentences
augmented_sentences_labels=[]                       # List to save category
for i in X_train.index:                             
  if y_train[i]=='negative':                        # Loop will only work for negative tweets
    temps=aug.augment(X_train[i],n=2)               # Maximum 2 augmented sentences will be formed
    for sent in temps:
      augmented_sentences.append(sent)              # Sentences saved in augmented sentence list
      augmented_sentences_labels.append('negative') # label saved

In [None]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)        # Adding the list of augmented sentences to  X-train
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True) # Adding the category to y_train

In [None]:
df=pd.DataFrame(list(zip(y_train,X_train)),columns = ['Category','Tweet'])        # Changing the lists to datarame

In [None]:
df1=pd.DataFrame(list(zip(y_test,X_test)),columns = ['Category','Tweet'])         # Changing list to dataframe

In [None]:
tdf=df.append(df1,ignore_index=True)                                              # Appending the two list to get the data together in 1 datafrane

In [None]:
tdf['Category'].value_counts()                                                    # Checking for class imbalance

positive    2599
negative    2181
neutral     1953
Tweet          1
Name: Category, dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_df['Tweet'],train_df['Category'])

In [None]:
aug=naw.SynonymAug(aug_src='wordnet',aug_max=3)

In [None]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]=='neutral':
    temps=aug.augment(X_train[i],n=2)
    for sent in temps:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append('neutral')

In [None]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

In [None]:
df=pd.DataFrame(list(zip(y_train,X_train)),columns = ['Category','Tweet'])

In [None]:
df1=pd.DataFrame(list(zip(y_train,X_train)),columns = ['Category','Tweet'])

In [None]:
tdf1=df.append(df1,ignore_index=True)

In [None]:
tdf1['Category'].value_counts()

neutral     8700
positive    3922
negative    1308
Tweet          2
Name: Category, dtype: int64

## Tokenization

In [None]:
train_clean_df,test_clean_df=train_test_split(tdf,test_size=0.15)     # Splitting the data into two only possible when not using label, train size = 0.85

In [None]:
# Using list for further cleaning, pandas can be used too
train_set=list(train_clean_df.to_records(index=False))      # Changes df to list and adds to train_set
test_set=list(test_clean_df.to_records(index=False))        # Changes df to list and adds to test_set

In [None]:
train_set= [(label, word_tokenize(remove_links_mentions(tweet))) for label,tweet in train_set]   # Tokenize words in train set
train_set[:1]

[('positive',
  ['may',
   'i',
   'just',
   'say',
   'that',
   'tom',
   'cruise',
   'looks',
   'hot',
   'as',
   'fuck',
   'in',
   'a',
   'navy',
   'blue',
   'suit'])]

In [None]:
test_set= [(label, word_tokenize(remove_links_mentions(tweet))) for label,tweet in test_set]   # Tokenize words in train set
test_set[:1]

[('neutral',
  ['what',
   "'s",
   'it',
   'like',
   'to',
   'have',
   'a',
   'girlfriend',
   '?',
   'i',
   'got',
   'a',
   'valentine',
   "'s",
   'day',
   'heart',
   'with',
   'a',
   'piece',
   'of',
   'candy',
   'in',
   '7th',
   'grade',
   '.',
   'basically',
   'the',
   'same',
   'thing',
   '.'])]

## Creating vocabulary

In [None]:
index2word = ["<SOS>","<EOS>"]
for ds in [train_set, test_set]:
  for label,tweet in ds:
    for token in tweet:
      if token not in index2word:
        index2word.append(token)
index2word[2]
word2index = {token: idx for idx, token in enumerate(index2word)}
word2index["may"]

2

In [None]:
def encode(tweet):
  sos = [word2index["<SOS>"]]
  eos = [word2index["<EOS>"]]
  encoded = [word2index[w] for w in tweet]
  return sos + encoded + eos  

Encode categories too as 0,1 and 2 respectively

In [None]:
def label_map(label):
    if label == "negative":
        return 0
    elif label == "neutral":
        return 1
    else: #positive
        return 2

In [None]:
train_encode= [(encode(tweet),label_map(label)) for label,tweet in train_set]

In [None]:
for i in train_encode[:2]:
  print(i)

([0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1], 2)
([0, 18, 19, 20, 21, 22, 13, 23, 24, 24, 24, 1], 2)
