### 1. Importing all required modules

Please make sure all required modules are installed before running this notebook.

In [1]:
import pandas
import numpy
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, jaccard_score

import re  # library for regular expression operations
import string  # for string operations

import nltk
from nltk.corpus import stopwords  # module for stop words that come with NLTK
from nltk.stem import WordNetLemmatizer  # module for stemming
from nltk.tokenize import TweetTokenizer  # module for tokenizing strings
from nltk.sentiment import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt

### 2. Download all necessary nltk files

In [2]:
# NLTK data needed by the cells below: the stop-word list, WordNet data for the
# lemmatizer, and the VADER lexicon for SentimentIntensityAnalyzer.
required_nltk_data = ['stopwords', 'wordnet', 'omw-1.4', 'vader_lexicon']

# nltk.download() signals failure by returning False rather than raising, so
# collect the failures and report them instead of discarding the result.
failed_downloads = [name for name in required_nltk_data if not nltk.download(name)]

if failed_downloads:
    # Only warn: the data may already be installed locally from an earlier run.
    print(f"WARNING: could not download NLTK data: {failed_downloads}. "
          "Later cells will fail unless these resources are already installed locally.")

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading omw-1.4: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading vader_lexicon: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

### 3. Importing the dataset

In [3]:
# Read the labelled tweets; the path is relative to the working directory.
dataset_path = "covid_dataset.csv"
df = pandas.read_csv(dataset_path)

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,ID,Tweet,Sentiment
0,0,BanMediaHouse whose is responsible for spreadi...,1
1,1,Im waiting for someone to say to me that all t...,1
2,2,He is a liar. Proven day night. Time again. Li...,1
3,3,"NEW: U.S. CoronaVirus death toll reaches 4,000...",0
4,4,Coronavirus impact Govt extends I-T deadlines ...,0
5,5,"42,000 people might have died in China from Co...",0
6,6,This is how the govt of kenya is checking the ...,0
7,7,"you are a complete wanker, and I hope you die ...",1
8,8,Just heard that my Oncle in France has the Cov...,1
9,9,That moment you realize your new medication ha...,0


#### Deleting unnecessary columns 

In [4]:
# Drop the 'ID' column, which is not used as a feature.
# errors='ignore' keeps the cell re-runnable (the column is already gone on a
# second run) without a try/except that swallows KeyError, and rebinding `df`
# avoids mutating the frame in place.
df = df.drop(columns=['ID'], errors='ignore')

df.head(10)

Unnamed: 0,Tweet,Sentiment
0,BanMediaHouse whose is responsible for spreadi...,1
1,Im waiting for someone to say to me that all t...,1
2,He is a liar. Proven day night. Time again. Li...,1
3,"NEW: U.S. CoronaVirus death toll reaches 4,000...",0
4,Coronavirus impact Govt extends I-T deadlines ...,0
5,"42,000 people might have died in China from Co...",0
6,This is how the govt of kenya is checking the ...,0
7,"you are a complete wanker, and I hope you die ...",1
8,Just heard that my Oncle in France has the Cov...,1
9,That moment you realize your new medication ha...,0


#### Assigning the variables

In [5]:
# Model input (tweet text) and target (sentiment label) columns.
x, y = df['Tweet'], df['Sentiment']

#### Average length of tweet

In [6]:
# Total number of characters across all tweets.
tweet_length = sum(len(tweet) for tweet in x)

# Mean number of characters per tweet.
average_length = tweet_length / len(x)

print(f'Average length of tweet = {round(average_length, 2)} characters')

Average length of tweet = 97.61 characters


### 4. Processing of tweets: removing symbols and stopwords, tokenizing, and lemmatizing each tweet

We use lemmatization instead of stemming. Stemming reduces a word to its stem by stripping suffixes, so the result is not always a real word.
Lemmatization instead maps a word to its dictionary form (lemma), taking the word's meaning into account.
That is why lemmatization is more accurate in this case.

In [7]:
def process_tweet(tweets):
    """Clean, tokenize and lemmatize a single tweet.

    Steps, in order: remove stock tickers (e.g. ``$GE``), a leading ``RT``
    marker, hyperlinks and ``#`` signs; tokenize with ``TweetTokenizer``
    (lower-casing, stripping @handles, shortening repeated characters);
    drop English stop words and punctuation tokens; lemmatize the rest.

    param: tweets -- the text of ONE tweet (a string, despite the plural name)
    return: list of processed tokens, e.g. ['first', 'second', 'third']
    """
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests in the filter below.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweets = re.sub(r'\$\w*', '', tweets)

    # remove old style retweet text "RT"
    tweets = re.sub(r'^RT[\s]+', '', tweets)

    # remove hyperlinks; \S+ stops at the first whitespace, so the words that
    # follow a link are kept (a greedy `.*` would delete the rest of the line)
    tweets = re.sub(r'https?://\S+', '', tweets)

    # remove hashtags, only removing the hash # sign from the word
    tweets = re.sub(r'#', '', tweets)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweets)

    # Keep tokens that are neither stop words nor punctuation, lemmatized.
    # string.punctuation is a str, so the second test is a substring check:
    # it removes single punctuation characters.
    return [
        lemmatizer.lemmatize(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

#### Calculating the length of tweet and polarity score of that tweet


In [8]:
def get_polarity_score(tweets):
    """Build one feature row per tweet: its length and two polarity scores.

    param: tweets -- iterable of tweet strings
    return: 2-D list with one row per tweet,
            [length in characters, positivity score, negativity score],
            e.g. [[23, 0.12, 0.34], ...]

    The analyzer result also contains 'neu' and 'compound' scores; they are
    not used as features.
    """
    sentiment_analyzer = SentimentIntensityAnalyzer()
    sentiment_score = []
    for tweet in tweets:
        # Score each tweet once and read both values from the same result.
        scores = sentiment_analyzer.polarity_scores(tweet)
        sentiment_score.append([len(tweet), scores['pos'], scores['neg']])
    return sentiment_score


#### Iterating over each tweet, processing it, and storing the processed tweets in a list

In [9]:
# an empty list to store all processed tweets
processed_x = []
for tweet in x:
    clean_tweet = " ".join(process_tweet(tweet))
    processed_x.append(clean_tweet)
    
processed_x

['banmediahouse whose responsible spreading fake communal story pandemic corona situation',
 'im waiting someone say corona thing april fool joke',
 'liar proven day night time lie truth covid 19',
 'new u coronavirus death toll reach 4,000 nearly 900 new death reported today bno news covid 19 coronavirusoutbreak',
 'coronavirus impact govt extends i-t deadline related section 80c 80d',
 '42,000 people might died china covid 19 china underreporting according source',
 'govt kenya checking temperature covid 19 saying still low mtashangaa sana',
 'complete wanker hope die coronavirus fucking dickhead',
 'heard oncle france covid 19 coma critical condition guy please aint joke safe',
 'moment realize new medication side effect identical corona virus symptom know',
 'three team fighting covid 19 day night 1 doctor nurse policeman 2 khan pr team 3 cell',
 "six new coronavirus case reported karnataka taking state's tally 181 coronavirusoutbreak cautionyespanicno",
 'lot chatter 16000 staff n

### 5. Splitting the dataset for training and testing

Holding out a test set lets us estimate the out-of-sample accuracy of the model.

In [10]:
# Hold out 30% of the tweets for testing; the fixed random_state makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    processed_x,
    y,
    test_size=0.3,
    random_state=42,
)

### 6. Taking user input i.e. Tweet by actual user and then analysing its sentiment.

The user must also provide the actual sentiment of the tweet so that it can be included when computing the model's evaluation metrics.

In [None]:
tweet_input = input("Enter your tweet: ")
# The user's tweet gets the same cleaning as the dataset and becomes the last
# entry of the test set.
test_x.append(" ".join(process_tweet(tweet_input)))

print("What is the actual Sentiment of the Tweet? Is the tweet covid-positive or covid-negative")
print("Input is essential for calculating model accuracy")
sentiment_input = input('Type: 0 for negative, 1 for positive: ')
senti = [0, 1]

# Re-prompt until the answer parses as one of the allowed labels. Parsing inside
# try/except means non-numeric input (e.g. "yes" or an empty line) asks again
# instead of raising ValueError.
while True:
    try:
        sentiment_value = int(sentiment_input)
    except ValueError:
        sentiment_value = None
    if sentiment_value in senti:
        break
    sentiment_input = input('Wrong input. Try again!: ')

# Convert the labels to a plain list so the user's label can be appended,
# keeping test_y aligned with test_x.
test_y = list(test_y)
test_y.append(sentiment_value)

### 7. Feature extraction:
We are extracting three features from each tweet.
1. length of the processed tweet (in characters)
2. positivity score, i.e. the amount of positivity in the tweet
3. negativity score, i.e. the amount of negativity in the tweet

We are storing feature in a 2-D array as it is required by SVC class

In [None]:
# One row per training tweet: [length, positivity score, negativity score].
feature = get_polarity_score(train_x)

# Preview only the first few rows; displaying the whole list floods the output.
feature[:10]

#### Typecasting training set variables to numpy array

In [None]:
# Convert the training features and labels to numpy arrays for the classifier.
train_x, train_y = numpy.array(feature), numpy.array(train_y)


### 8. Model training

Dimension of variable 'feature' has to be 2 and dimension of variable 'y' has to be 1 as required by SVC classifier

In [None]:
# Support-vector classifier with a linear kernel, fitted on the training features.
classify = SVC(kernel="linear")
classify.fit(X=train_x, y=train_y)

### 9. Processing of Test Dataset 

In [None]:
# test_x already holds processed tweets: it was split off processed_x, and the
# user's tweet was processed before being appended. Running process_tweet a
# second time would build the test features differently from the training
# features, so the features are computed from test_x directly.
test_feature = get_polarity_score(test_x)

# Preview only the first few rows; displaying the whole list floods the output.
test_feature[:10]

#### Typecasting testing set variables into numpy array

In [None]:
# Convert the test features and labels to numpy arrays for prediction and scoring.
test_feature_x, test_y = numpy.array(test_feature), numpy.array(test_y)

### 10. Model Prediction / Testing 

In [None]:
# Predict a label for every row of the test set.
predict = classify.predict(test_feature_x)

print(f'Tweet: {tweet_input}')
print(f'Tokens: {process_tweet(tweet_input)}')

# The user's tweet was appended to the test set last, so its prediction is the
# final element.
if predict[-1] == 1:
    sentiment = 'Positive'
else:
    sentiment = 'Negative'

print(f'Sentiment: {sentiment}')

### 11. Evaluation metrics

In [None]:
# F1 score of the predictions, averaged across classes with average='weighted'.
weighted_f1 = f1_score(test_y, predict, average='weighted')
print(f"F1 Score = {weighted_f1}")

In [None]:
# Jaccard index between the true and predicted labels.
jaccard_index = jaccard_score(test_y, predict)
print(f"Jaccard Index = {jaccard_index}")

### 12. Model Plotting

In [None]:
# One scatter series per feature column (length, positivity, negativity),
# each plotted against the true sentiment label.
feature_colours = ('r', 'g', 'm')
for column, colour in enumerate(feature_colours):
    plt.scatter(test_feature_x[:, column], test_y, color=colour)

# some colours in legend are not visible on plot because of overlapping but they are being rendered
plt.legend(['Length of Tweet', 'Positivity Score', 'Negativity Score'], title="Features")
plt.title("Sentiment against feature")

plt.xlabel("Features", size=14)
plt.ylabel("Sentiment", size=14)

plt.show()