In [1]:
# importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# load the labelled text corpus from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='dataset/5.corona_data_anxious_not_anxious.csv')
# preview the first five rows
data.head(n=5)

Unnamed: 0,Text,label
0,2020 2019 2018 2017 2016 2015 2014 2013 2012 2...,0
1,statement resum 73rd world health assembl chai...,0
2,digit updat find latest covid 19 content guida...,0
3,health emerg highlight issu health emerg highl...,0
4,global 9 33am cet 27 decemb 2020 79 062 802 co...,1


In [3]:
# hold out 20% of the rows for testing; stratifying on the label column and
# fixing the random seed are both passed explicitly to the splitter
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=21,
    stratify=data.label,
)

# show the dimensions of the two splits
(train.shape, test.shape)

((308, 2), (78, 2))

In [4]:
# build a TF-IDF vectorizer limited to at most 1000 features
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# fit the vectorizer on the Text column of the training split only
tfidf_vectorizer.fit(train['Text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [5]:
# vectorize the Text column of each split with the already-fitted vectorizer
# (train first, then test)
train_idf, test_idf = (
    tfidf_vectorizer.transform(split['Text']) for split in (train, test)
)

In [6]:
# display both TF-IDF matrices as a pair
(train_idf, test_idf)

(<308x1000 sparse matrix of type '<class 'numpy.float64'>'
 	with 16553 stored elements in Compressed Sparse Row format>,
 <78x1000 sparse matrix of type '<class 'numpy.float64'>'
 	with 3597 stored elements in Compressed Sparse Row format>)

In [7]:
# build a LogisticRegression model with default settings and train it on the
# TF-IDF features and labels of the training split (fit returns the estimator)
model_LR = LogisticRegression().fit(train_idf, train['label'])

# predicted labels for the test split
predict_test = model_LR.predict(test_idf)

# predicted labels for the training split
predict_train = model_LR.predict(train_idf)




**Macro-averaged F1 score on the training data (the test-data score follows in the next cell)**

In [8]:
# macro-averaged F1 of the training-split predictions against the true labels
f1_score(train['label'], predict_train, average='macro')


0.9936688311688311

In [9]:
# macro-averaged F1 of the test-split predictions against the true labels
f1_score(test['label'], predict_test, average='macro')


0.9486521181001284

In [10]:
# chain a TF-IDF vectorizer and a logistic-regression model into one estimator
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_features=1000)),
        ('model', LogisticRegression()),
    ]
)

# fit the whole pipeline on the raw training text and its labels
pipeline.fit(train['Text'], train['label'])



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scali

In [11]:
# a one-item list holding the sample text
text = ['covid']

# pass the raw text to the pipeline to get its predicted label
pipeline.predict(text)


array([1], dtype=int64)

In [12]:
# bring in joblib's dump function
from joblib import dump

# write the fitted pipeline to text_classification.joblib
dump(pipeline, "text_classification.joblib")

['text_classification.joblib']

In [13]:
# bring in joblib's load function
from joblib import load

# read the saved pipeline back from text_classification.joblib
pipeline = load(filename="text_classification.joblib")

# sample text to classify
text = ["covid is dangerous"]

# predicted label for the sample text, using the reloaded pipeline
pipeline.predict(text)


array([1], dtype=int64)

In [14]:
# display the DataFrame; the rendered output below elides the middle rows
data

Unnamed: 0,Text,label
0,2020 2019 2018 2017 2016 2015 2014 2013 2012 2...,0
1,statement resum 73rd world health assembl chai...,0
2,digit updat find latest covid 19 content guida...,0
3,health emerg highlight issu health emerg highl...,0
4,global 9 33am cet 27 decemb 2020 79 062 802 co...,1
...,...,...
381,m8 glasgow citi eastbound sever disrupt j16 ca...,2
382,m5 bristol southbound sever disrupt a4 m5 bris...,2
383,conserv group wwf warn koala could wipe austra...,2
384,a47 norfolk westbound sever accid shack lane a...,2
