In [1]:
# importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# read the dataset
data = pd.read_csv('dataset/5.corona_data_anxious_not_anxious.csv')
# view the top rows
data.head()

Unnamed: 0,Text,label
0,autocomplet result avail use arrow review ente...,0
1,2020 2019 2018 2017 2016 2015 2014 2013 2012 2...,0
2,statement resum 73rd world health assembl chai...,1
3,global public health day offer great potenti r...,0
4,autocomplet result avail use arrow review ente...,0


In [3]:
# train test split
train, test = train_test_split(data, test_size = 0.2, stratify = data['label'], random_state=21)

# get the shape of train and test split.
train.shape, test.shape

((861, 2), (216, 2))

In [4]:
# create a TF-IDF vectorizer object
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# fit the object with the training data tweets
tfidf_vectorizer.fit(train.Text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [5]:
# transform the train and test data
train_idf = tfidf_vectorizer.transform(train.Text)
test_idf  = tfidf_vectorizer.transform(test.Text)

In [6]:
train_idf , test_idf

(<861x1000 sparse matrix of type '<class 'numpy.float64'>'
 	with 29370 stored elements in Compressed Sparse Row format>,
 <216x1000 sparse matrix of type '<class 'numpy.float64'>'
 	with 7893 stored elements in Compressed Sparse Row format>)

In [7]:
# create the object of LogisticRegression Model
model_LR = LogisticRegression()

# fit the model with the training data
model_LR.fit(train_idf, train.label)

# predict the label on the traning data
predict_train = model_LR.predict(train_idf)

# predict the model on the test data
predict_test = model_LR.predict(test_idf)




**f1 score on train data**

In [8]:
f1_score(y_true= train.label, y_pred= predict_train, average='macro')


0.9924898075960231

In [9]:
f1_score(y_true= test.label, y_pred= predict_test, average='macro')


0.9693242821029908

In [10]:
# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(max_features=1000)), ('model', LogisticRegression())])

# fit the pipeline model with the training data                            
pipeline.fit(train.Text, train.label)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scali

In [11]:
# sample tweet
text = ["covid"]

# predict the label using the pipeline
pipeline.predict(Text)


NameError: name 'Text' is not defined

In [None]:
# import joblib
from joblib import dump

# dump the pipeline model
dump(pipeline, filename="text_classification.joblib")

In [None]:
# import joblib
from joblib import load

# sample text
text = ["covid is dangerous"]

# load the saved pipleine model
pipeline = load("text_classification.joblib")

# predict on the sample text
pipeline.predict(text)


In [None]:
data