# Building an NLP Model - From Data Collection to Model Evaluation

## 1: Data Collection

In [1]:
import pandas as pd

#### Social media post 

In [2]:
df=pd.read_csv('train.csv')
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [3]:
df.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [4]:
df.drop_duplicates()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


## 2: Data Preprocessing

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import re

In [6]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bibhakar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bibhakar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Tokenization function
def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

# Stopwords removal
stop_words = set(stopwords.words('english'))

# Stemming function
stemmer = SnowballStemmer(language='english')
def stem_tokens(tokens, stemmer):
    stemmed = [stemmer.stem(token) for token in tokens]
    return stemmed

# Preprocessing function
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    tokens = tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    stemmed_tokens = stem_tokens(filtered_tokens, stemmer)
    return ' '.join(stemmed_tokens)

# Apply preprocessing to the tweet_text column
df['preprocessed_text'] = df['tweet'].apply(preprocess_text)

In [8]:
df

Unnamed: 0,id,label,tweet,preprocessed_text
0,1,0,@user when a father is dysfunctional and is s...,user father dysfunct selfish drag kid dysfunct...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thank lyft credit cant use caus dont...
2,3,0,bihday your majesty,bihday majesti
3,4,0,#model i love u take with u all the time in ...,model love u take u time ur
4,5,0,factsguide: society now #motivation,factsguid societi motiv
...,...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...,ate user isz youuu
31958,31959,0,to see nina turner on the airwaves trying to...,see nina turner airwav tri wrap mantl genuin h...
31959,31960,0,listening to sad songs on a monday morning otw...,listen sad song monday morn otw work sad
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...",user sikh templ vandalis calgari wso condemn act


In [9]:
df['preprocessed_text']

0        user father dysfunct selfish drag kid dysfunct...
1        user user thank lyft credit cant use caus dont...
2                                           bihday majesti
3                              model love u take u time ur
4                                  factsguid societi motiv
                               ...                        
31957                                   ate user isz youuu
31958    see nina turner airwav tri wrap mantl genuin h...
31959             listen sad song monday morn otw work sad
31960     user sikh templ vandalis calgari wso condemn act
31961                                    thank user follow
Name: preprocessed_text, Length: 31962, dtype: object

## 3: Feature Extraction

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize a count Vectorizer
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(df['preprocessed_text'])

# Output vocabulary and count vector matrix
print("Count Matrix:\n", count_matrix.toarray())

Count Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [11]:
count_matrix.shape

(31962, 33462)

## 4: Model Selection and Training

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [13]:
df['label']

0        0
1        0
2        0
3        0
4        0
        ..
31957    0
31958    0
31959    0
31960    1
31961    0
Name: label, Length: 31962, dtype: int64

In [14]:
labels = df['label']
# Split the dataset into training, validation and testing sets
train_val_input, test_input, train_val_target, test_target = train_test_split(count_matrix, labels, test_size=0.3,random_state=42)

train_input, val_input, train_target, val_target= train_test_split(train_val_input, train_val_target, test_size=0.25, random_state=54)

In [15]:
model = LogisticRegression(max_iter=1000)
model.fit(train_input, train_target)

## 5: Evaluation

In [16]:
val_preds = model.predict(val_input)
val_accuracy = accuracy_score(val_target, val_preds)
print(f"Validation Accuracy Score: {val_accuracy}")

Validation Accuracy Score: 0.9619234894529853


In [17]:
predictions = model.predict(test_input)
accuracy = accuracy_score(test_target, predictions)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.9545312337052874


In [18]:
print('Classification Report:\n', classification_report(test_target, predictions))

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      8905
           1       0.81      0.47      0.60       684

    accuracy                           0.95      9589
   macro avg       0.89      0.73      0.79      9589
weighted avg       0.95      0.95      0.95      9589



In [19]:
print(f"Precision Score: {precision_score(test_target, predictions, average=None)}")

Precision Score: [0.96052202 0.81472081]


In [20]:
print(f"Precision Score: {recall_score(test_target, predictions, average=None)}")

Precision Score: [0.99180236 0.46929825]


In [21]:
print(f"Precision Score: {f1_score(test_target, predictions, average=None)}")

Precision Score: [0.9759116  0.59554731]


# BIBHAKAR PAUL