### Import libraries

In [1]:
import nltk
import re
import pandas as pd
import spacy
import string
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import LdaModel,CoherenceModel
from nltk import pos_tag
from gensim import corpora

In [2]:
from nltk.stem import WordNetLemmatizer

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anees\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anees\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anees\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 1.Load the tweets file using the read_csv function from the Pandas package

In [5]:
df=pd.read_csv('TwitterHate.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [7]:
df.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
df.shape

(31962, 3)

### 3.Apply the following steps to clean up the tweets

a) Normalize the casing

In [9]:
tweets=list(df['tweet'].str.lower())
tweets[:5]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

### b) Use regular expressions and remove user handles that begin with @

In [10]:
tweets1=[re.sub(r'@\w+',' ', text) for text in tweets]
tweets1[:5]

['   when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "    thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

### c) Use regular expressions, and remove URLs

In [11]:
text_cleanup1=[re.sub(r'http\S+', ' ',t).strip() for t in tweets1]
text_cleanup1[:5]

['when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 'bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦',
 'factsguide: society now    #motivation']

### d) UseTweetTokenizer from NLTK to tokenize the tweets into individual terms

In [12]:
from nltk.tokenize import TweetTokenizer

In [13]:
tokenizer=TweetTokenizer(preserve_case=True,strip_handles=True, reduce_len=True)
tokenized_tweets=[tokenizer.tokenize(tweet)for tweet in text_cleanup1]

### e) Remove stop words

In [14]:
stop_words=set(stopwords.words('english'))
text_cleanup2=[[word for word in tweet if word not in stop_words] for tweet in tokenized_tweets]
print(text_cleanup2[:5])

[['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', '.', '#run'], ['thanks', '#lyft', 'credit', "can't", 'use', 'cause', 'offer', 'wheelchair', 'vans', 'pdx', '.', '#disapointed', '#getthanked'], ['bihday', 'majesty'], ['#model', 'love', 'u', 'take', 'u', 'time', 'urð', '\x9f', '\x93', '±', '!', '!', '!', 'ð', '\x9f', '\x98', '\x99', 'ð', '\x9f', '\x98', '\x8e', 'ð', '\x9f', '\x91', '\x84', 'ð', '\x9f', '\x91', 'ð', '\x9f', '\x92', '¦', 'ð', '\x9f', '\x92', '¦', 'ð', '\x9f', '\x92', '¦'], ['factsguide', ':', 'society', '#motivation']]


### f) Remove redundant terms like amp and rt

In [15]:
redundant_terms=['amp','rt']
text_cleanup3=[[word for word in tweet if word not in redundant_terms] for tweet in text_cleanup2]

### g) Remove # from the tweets while retaining the text that follows it

In [16]:
text_cleanup4=[[word[1:] if word.startswith('#') else word for word in tweet] for tweet in text_cleanup3]
print(text_cleanup4[:5])

[['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', '.', 'run'], ['thanks', 'lyft', 'credit', "can't", 'use', 'cause', 'offer', 'wheelchair', 'vans', 'pdx', '.', 'disapointed', 'getthanked'], ['bihday', 'majesty'], ['model', 'love', 'u', 'take', 'u', 'time', 'urð', '\x9f', '\x93', '±', '!', '!', '!', 'ð', '\x9f', '\x98', '\x99', 'ð', '\x9f', '\x98', '\x8e', 'ð', '\x9f', '\x91', '\x84', 'ð', '\x9f', '\x91', 'ð', '\x9f', '\x92', '¦', 'ð', '\x9f', '\x92', '¦', 'ð', '\x9f', '\x92', '¦'], ['factsguide', ':', 'society', 'motivation']]


### 4. Use the cleanup process to remove terms with a length of 1

In [17]:
text_cleanup5=[[word for word in tweet if len(word)>1] for tweet in text_cleanup4]
print(text_cleanup5[:5])

[['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run'], ['thanks', 'lyft', 'credit', "can't", 'use', 'cause', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked'], ['bihday', 'majesty'], ['model', 'love', 'take', 'time', 'urð'], ['factsguide', 'society', 'motivation']]


### 5. Check the top terms in the tweets:

a) First, get all tokenized terms into one list

b) Use the counter, and find the 10 most common terms

In [18]:
from collections import Counter
all_words=list([word for tweet in text_cleanup5 for word in tweet ])
common_words=Counter(all_words).most_common(10)
print(common_words)

[('...', 2810), ('love', 2748), ('day', 2276), ('happy', 1684), ('time', 1131), ('life', 1118), ('like', 1047), ('today', 1013), ('new', 994), ('thankful', 946)]


### 6. Format the data for predictive modeling:
                  
a) Join the tokens back to form strings, which will be required for the
vectorizers

In [19]:
tweets_cleaned=[(' ').join(tweet) for tweet in text_cleanup5]
tweets_cleaned[:5]

['father dysfunctional selfish drags kids dysfunction run',
 "thanks lyft credit can't use cause offer wheelchair vans pdx disapointed getthanked",
 'bihday majesty',
 'model love take time urð',
 'factsguide society motivation']

#### b) Assign x and y

In [20]:
X=tweets_cleaned
y=df['label']

#### c) Perform train_test_split using sklearn

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,train_size=0.2,random_state=42)

### 7.Use TF IDF values for the terms as a feature to get into a vector space model
a) Import TF IDF vectorizer from sklearn

b) Instantiate the model with a maximum of 5000 terms in your vocabulary

c) Fit and apply the vector space model on the train set

d) Apply the model on the test set

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=5000)
X_train_tfidf=tfidf.fit_transform(X_train)
X_test_tfidf=tfidf.transform(X_test)

### 8. Model building: ordinary logistic regression
a) Instantiate logistic regression from sklearn with default parameters

b) Fit model on the train data

c) Make predictions for the train and the test sets

In [23]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train_tfidf,y_train)
y_train_pred=lr.predict(X_train_tfidf)
y_test_pred=lr.predict(X_test_tfidf)

### 9. Model evaluation: accuracy, recall, and f_1 score
a) Report the accuracy of the train set

b) Report the recall on the train set: decent, high, or low

c) Get the f_1 score on the train set

In [24]:
from sklearn.metrics import accuracy_score,recall_score,f1_score
print("Train accuracy", accuracy_score(y_train,y_train_pred))
print("Train recall", recall_score(y_train,y_train_pred))
print("Train f1", f1_score(y_train,y_train_pred))

Train accuracy 0.9392991239048811
Train recall 0.13839285714285715
Train f1 0.2421875


### 10. Adjust the class imbalance, if any

a) Adjust the appropriate class in the logistic regression model

In [42]:
## Class imbalance happens when your dataset contains many more examples of one class than another.
lr_balanced=Log isticRegression(class_weight='balanced') 
lr_balanced.fit(X_train_tfidf,y_train)
y_train_pred_bal=lr_balanced.predict(X_train_tfidf)

### 11. Train the model again with the adjustment and evaluate
a) Train the model on the train set

b) Evaluate the predictions on the train set: accuracy, recall, and f_1 score


In [26]:
print("Balanced Train accuracy", accuracy_score(y_train,y_train_pred_bal))
print("Balanced Train recall", recall_score(y_train,y_train_pred_bal))
print("Balanced Train f1", f1_score(y_train,y_train_pred_bal))

Balanced Train accuracy 0.9740300375469336
Balanced Train recall 0.9977678571428571
Balanced Train f1 0.8433962264150944


### 12. Regularization and hyperparameter tuning:

a) Import GridSearch and StratifiedKFold

b) Choose for C and penalty parameters under the parameters grid

c) Use a balanced class weight while instantiating the logistic regression

### 13. Find the parameters with the best recall in cross validation

a) Chooserecall as the metric for scoring

b) Choose a stratified four fold cross validation scheme

c) Fit it on the train set

In [35]:
from sklearn.model_selection import GridSearchCV,StratifiedKFold
param_grid={
    'C':[0.01,0.1,1,10],        ## Inverse of the regularization strength
    'penalty':['l1','l2'],      ## Type of regularisation
    'solver':['liblinear']}      ## Optimization algorithm used to minimize the loss function in Logistic Regression
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)  ##It ensures that each fold used during cv has the same class distribution as the original dataset.
lr_tune = LogisticRegression(class_weight='balanced')
grid = GridSearchCV(lr_tune, param_grid, scoring='recall', cv=skf)
grid.fit(X_train_tfidf, y_train)

### 14. List the best parameters

In [39]:
print("Best parameters: " , grid.best_params_)

Best parameters:  {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}


### 15. Predict and evaluate parameters using the best estimator

a) Use the best estimator from the grid search to make predictions on the test set

b) Find the recall on the test set for the toxic comments

c) Find the f_1 score

In [41]:
best_model=grid.best_estimator_
y_test_final=best_model.predict(X_test_tfidf)
print("Test recall:", recall_score(y_test,y_test_final))
print("Test f1_score:", f1_score(y_test,y_test_final))

Test recall: 0.6404682274247492
Test f1_score: 0.5805962607377463
