# Sentiment Analysis Of Tweets with Flair

## Install Flair

In [1]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.9/401.9 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 KB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting segtok>=1.5.7
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting conllu>=4.0
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Collecting hyperopt>=0.2.7
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)


## Imports

In [3]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Inject CSS that sets elements with class `container` to 90% width.
display(HTML('<style> .container {width:90% !important} </style>'))

import warnings
warnings.filterwarnings('ignore')

import flair
import pandas as pd
import numpy as np
import copy
import re

# Import the data
The data is taken from the Analytics Vidhya "Linguipedia Codefest: Natural Language Processing" contest: https://datahack.analyticsvidhya.com/contest/linguipedia-codefest-natural-language-processing-1/#ProblemStatement

In [4]:
# Location of the training CSV. Kept in one named constant so the notebook can
# be pointed at a different copy of the file without editing the load call.
TRAIN_CSV_PATH = '/content/sample_data/train.csv'

df = pd.read_csv(TRAIN_CSV_PATH)

# Later cells read the `tweet` and `label` columns; fail here with a clear
# message instead of part-way through the notebook if the file has another schema.
missing_columns = {'label', 'tweet'} - set(df.columns)
if missing_columns:
    raise KeyError(f'{TRAIN_CSV_PATH} is missing required columns: {sorted(missing_columns)}')

print('Shape of the dataframe:', df.shape)
print('Columns:', df.columns)

Shape of the dataframe: (7920, 3)
Columns: Index(['id', 'label', 'tweet'], dtype='object')


In [20]:
# Preview the first five rows of the dataframe.
# NOTE(review): the saved output of this cell already contains columns that are
# only created by later cells (clean_tweet, labels_scores, predicted_*), so it
# was last executed out of order; a fresh top-to-bottom run will show only the
# columns loaded from the CSV.
df.head()

# Meaning of the `label` column:
# 0 --> positive label
# 1 --> negative label

Unnamed: 0,id,label,tweet,clean_tweet,labels_scores,predicted_label_name,predicted_scores,predicted_label_value
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint Pregnancy Test https goo gl h MfQ...,"(POSITIVE, 0.5724265575408936)",Positive,0.5724,0
1,2,0,Finally a transparant silicon case ^^ Thanks t...,Finally a transparant silicon case Thanks to m...,"(POSITIVE, 0.9993873834609985)",Positive,0.9994,0
2,3,0,We love this! Would you go? #talk #makememorie...,We love this Would you go talk makememories un...,"(POSITIVE, 0.9769495129585266)",Positive,0.9769,0
3,4,0,I'm wired I know I'm George I was made that wa...,I m wired I know I m George I was made that wa...,"(POSITIVE, 0.7664690017700195)",Positive,0.7665,0
4,5,1,What amazing service! Apple won't even talk to...,What amazing service Apple won t even talk to ...,"(POSITIVE, 0.579282820224762)",Positive,0.5793,0



# Check For Null Values

In [21]:
# Print the per-column dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     7920 non-null   int64  
 1   label                  7920 non-null   int64  
 2   tweet                  7920 non-null   object 
 3   clean_tweet            7920 non-null   object 
 4   labels_scores          7920 non-null   object 
 5   predicted_label_name   7920 non-null   object 
 6   predicted_scores       7920 non-null   float64
 7   predicted_label_value  7920 non-null   int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 495.1+ KB


# Data Cleaning: replacing punctuation, numbers and other non-letter characters with spaces, then collapsing repeated spaces

In [7]:
# Two-step clean-up applied to every raw tweet, composed into a single pass:
#   1. each character that is not an ASCII letter is replaced by a space;
#   2. every run of consecutive spaces is squeezed into one space.
df['clean_tweet'] = df['tweet'].map(
    lambda raw_tweet: re.sub(' +', ' ', re.sub('[^a-zA-Z]', ' ', raw_tweet))
)

### Initialize the flair model

In [8]:
# Load flair's 'en-sentiment' text classifier. The saved log below shows the
# model file being downloaded when it is not already in the local flair cache.
flair_model = flair.models.TextClassifier.load('en-sentiment')
print('Flair Model Loaded...')

2023-02-24 17:43:14,040 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmpokkzjuf7


100%|██████████| 265512723/265512723 [00:25<00:00, 10570087.06B/s]

2023-02-24 17:43:39,981 copying /tmp/tmpokkzjuf7 to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2023-02-24 17:43:41,021 removing temp file /tmp/tmpokkzjuf7
2023-02-24 17:43:41,092 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Flair Model Loaded...


### Sentiment Analysis Function

In [12]:
def sentiment_analysis(tweet_col, flair_model):
    """Predict the sentiment of one piece of text with a flair classifier.

    Parameters
    ----------
    tweet_col : str
        Text of a single tweet (despite the name, this is one value, not a
        whole column).
    flair_model
        Object exposing ``predict(sentence)`` that attaches labels to the
        sentence in place, e.g. the classifier returned by
        ``flair.models.TextClassifier.load``.

    Returns
    -------
    tuple
        ``(label, score)`` taken from the first label attached to the sentence.

    Raises
    ------
    IndexError
        If the model attached no label to the sentence (presumably the case
        for empty or whitespace-only text -- TODO confirm against flair).
    """
    sentence = flair.data.Sentence(tweet_col)

    # predict() annotates the sentence in place; its return value is not used.
    flair_model.predict(sentence)

    # Read the labels once instead of calling get_labels() for each field.
    labels = sentence.get_labels()
    if not labels:
        # Same exception type a bare ``[0]`` lookup would raise, but with a
        # message that identifies the offending text.
        raise IndexError(
            f'flair returned no sentiment label for text {tweet_col!r}'
        )

    top_label = labels[0]
    return (top_label.value, top_label.score)

## Calling sentiment_analysis

In [13]:
# Only the `clean_tweet` column is needed, so apply over that Series directly
# rather than building a full row object for every tweet with axis=1.
# Each result is the (label, score) tuple returned by sentiment_analysis.
df['labels_scores'] = df['clean_tweet'].apply(
    lambda clean_text: sentiment_analysis(clean_text, flair_model)
)

In [15]:
# Show the first ten rows, including the new `labels_scores` column.
df.head(10)

Unnamed: 0,id,label,tweet,clean_tweet,labels_scores
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint Pregnancy Test https goo gl h MfQ...,"(POSITIVE, 0.5724265575408936)"
1,2,0,Finally a transparant silicon case ^^ Thanks t...,Finally a transparant silicon case Thanks to m...,"(POSITIVE, 0.9993873834609985)"
2,3,0,We love this! Would you go? #talk #makememorie...,We love this Would you go talk makememories un...,"(POSITIVE, 0.9769495129585266)"
3,4,0,I'm wired I know I'm George I was made that wa...,I m wired I know I m George I was made that wa...,"(POSITIVE, 0.7664690017700195)"
4,5,1,What amazing service! Apple won't even talk to...,What amazing service Apple won t even talk to ...,"(POSITIVE, 0.579282820224762)"
5,6,1,iPhone software update fucked up my phone big ...,iPhone software update fucked up my phone big ...,"(NEGATIVE, 0.9999022483825684)"
6,7,0,Happy for us .. #instapic #instadaily #us #son...,Happy for us instapic instadaily us sony xperi...,"(POSITIVE, 0.9793867468833923)"
7,8,0,New Type C charger cable #UK http://www.ebay.c...,New Type C charger cable UK http www ebay co u...,"(POSITIVE, 0.969228208065033)"
8,9,0,Bout to go shopping again listening to music #...,Bout to go shopping again listening to music i...,"(POSITIVE, 0.978144109249115)"
9,10,0,Photo: #fun #selfie #pool #water #sony #camera...,Photo fun selfie pool water sony camera picoft...,"(POSITIVE, 0.9925767779350281)"


In [17]:
# Every `labels_scores` entry is a (label, score) pair; split it into two columns.
# The label text is title-cased.
df['predicted_label_name'] = df['labels_scores'].map(lambda pair: pair[0].title())
# The score is rounded to four decimal places.
df['predicted_scores'] = df['labels_scores'].map(lambda pair: np.round(pair[1], 4))

In [22]:
# Numeric encoding of the predicted label names: Positive -> 0, Negative -> 1.
label_dic = dict(Positive=0, Negative=1)
df['predicted_label_value'] = df['predicted_label_name'].map(label_dic)

# Preview the dataframe with the prediction columns added.
df.head()

Unnamed: 0,id,label,tweet,clean_tweet,labels_scores,predicted_label_name,predicted_scores,predicted_label_value
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint Pregnancy Test https goo gl h MfQ...,"(POSITIVE, 0.5724265575408936)",Positive,0.5724,0
1,2,0,Finally a transparant silicon case ^^ Thanks t...,Finally a transparant silicon case Thanks to m...,"(POSITIVE, 0.9993873834609985)",Positive,0.9994,0
2,3,0,We love this! Would you go? #talk #makememorie...,We love this Would you go talk makememories un...,"(POSITIVE, 0.9769495129585266)",Positive,0.9769,0
3,4,0,I'm wired I know I'm George I was made that wa...,I m wired I know I m George I was made that wa...,"(POSITIVE, 0.7664690017700195)",Positive,0.7665,0
4,5,1,What amazing service! Apple won't even talk to...,What amazing service Apple won t even talk to ...,"(POSITIVE, 0.579282820224762)",Positive,0.5793,0


## Validating the model: weighted F1 score

In [19]:
from sklearn.metrics import f1_score

# Score the encoded predictions against the `label` column, using the
# 'weighted' averaging mode of f1_score.
print(
    'Weighted F1 SCore:',
    f1_score(
        y_true=df['label'],
        y_pred=df['predicted_label_value'],
        average='weighted',
    ),
)

Weighted F1 SCore: 0.691450951281932
