In [None]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Colab-only: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the labelled tweets from the mounted Drive and preview the first rows.
tweets = pd.read_csv('/content/drive/MyDrive/ML/train_data3v2.csv')
tweets.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3.0,0.0,0.0,3.0,Safe_Speech,!!! RT @mayasolovely: As a woman you shouldn't...
1,3.0,0.0,3.0,0.0,Offensive_Speech,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3.0,0.0,3.0,0.0,Offensive_Speech,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3.0,0.0,2.0,1.0,Offensive_Speech,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6.0,0.0,6.0,0.0,Offensive_Speech,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [None]:
# (rows, columns) of the raw frame.
tweets.shape

(44728, 6)

In [None]:
# Give the target column a clearer name ('class' -> 'Label').
tweets = tweets.rename(columns={'class': 'Label'})

In [None]:
# Keep only the target and the text. The explicit .copy() makes this an
# independent frame, so the later overwrite of the 'tweet' column does not
# assign into a slice of the original frame (SettingWithCopyWarning).
tweets = tweets[['Label', 'tweet']].copy()

In [None]:
# Class balance: number of tweets per label.
tweets['Label'].value_counts()

Offensive_Speech    19190
Safe_Speech         13882
Hate_Speech         11656
Name: Label, dtype: int64

In [None]:
# Total number of missing cells across the whole frame.
tweets.isna().sum().sum()

0

In [None]:
# Column dtypes, non-null counts and memory footprint.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44728 entries, 0 to 44727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   44728 non-null  object
 1   tweet   44728 non-null  object
dtypes: object(2)
memory usage: 699.0+ KB


In [None]:
# Summary of the two object columns (count / unique / top / freq).
# NOTE(review): the recorded output shows fewer unique tweets than rows, i.e.
# some texts are duplicated - confirm whether they should be dropped before
# the train/test split.
tweets.describe()

Unnamed: 0,Label,tweet
count,44728,44728
unique,3,42852
top,Offensive_Speech,#model i love u take with u all the time in ...
freq,19190,115


In [None]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def text_process(text, stop_words=None):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps, in order: strip URLs, lowercase, strip @mentions, strip digits,
    strip punctuation (including '#', '!' and '_'), strip the stray 'ð'
    character, drop the standalone retweet marker 'rt', and finally drop
    stop words.

    Punctuation is deleted rather than replaced by a space, so words joined
    only by punctuation are fused and contractions lose their apostrophe
    before the stop-word check.

    Parameters
    ----------
    text : str
        The raw tweet text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (read once and cached across calls).

    Returns
    -------
    str
        The cleaned tweet; an empty string if nothing survives.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = text.lower()
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # @mentions
    text = re.sub(r'\d+', '', text)
    # '_' counts as a word character for \w, so it has to be listed explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)
    # 'ð' is a letter, so it survives the punctuation pass; presumably it is
    # left over from mis-decoded emoji bytes - TODO confirm against the raw data.
    text = text.replace('ð', '')
    # Only a standalone 'rt' token is the retweet marker; a bare r'rt'
    # pattern would also eat the letters inside ordinary words.
    text = re.sub(r'\brt\b', '', text)

    return ' '.join(token for token in text.split() if token not in stop_words)

In [None]:
# Replace every raw tweet with its cleaned text (overwrites the 'tweet' column).
tweets['tweet']=tweets['tweet'].apply(text_process)

In [None]:
# Preview the cleaned text.
tweets.head()

Unnamed: 0,Label,tweet
0,Safe_Speech,woman shouldnt complain cleaning house amp man...
1,Offensive_Speech,boy dats coldtyga dwn bad cuffin dat hoe st place
2,Offensive_Speech,dawg ever fuck bitch sta cry confused shit
3,Offensive_Speech,look like tranny
4,Offensive_Speech,shit hear might true might faker bitch told ya


In [None]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split repeatable.
Tweet_train, Tweet_test, Label_train, Label_test = train_test_split(
    tweets['tweet'],
    tweets['Label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline model: TF-IDF features feeding a logistic-regression classifier.
# (LogisticRegression is already imported in the notebook's import cell.)
baseline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression(random_state=42)),
]
logreg = Pipeline(baseline_steps)

# Train on the training split
logreg.fit(Tweet_train, Label_train)

# Predict the held-out split and print the per-class metrics
PP = logreg.predict(Tweet_test)
baseline_report = classification_report(Label_test, PP)
print(baseline_report)


In [None]:
# Hyper-parameter search over the TF-IDF + logistic-regression pipeline.
# (GridSearchCV is already imported in the notebook's import cell.)

# Settings shared by every candidate
shared_grid = {
    'tfidf__max_features': [5000, 10000, 30000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'classifier__C': [0.01, 0.1, 1, 10, 100],
}

# LogisticRegression's default solver (lbfgs) does not support penalty='l1':
# with a single flat grid every l1 candidate fails to fit and is scored NaN,
# so half of the search is silently skipped. The grid is therefore split so
# that l2 keeps the default solver and l1 is paired with 'saga', which
# supports it.
param_grid = [
    {**shared_grid, 'classifier__penalty': ['l2']},
    {
        **shared_grid,
        'classifier__penalty': ['l1'],
        'classifier__solver': ['saga'],
        # saga may need more than the default number of iterations to
        # converge - NOTE(review): tune if ConvergenceWarning still appears.
        'classifier__max_iter': [1000],
    },
]

# Perform grid search with 5-fold cross-validation on the training split
grid = GridSearchCV(logreg, param_grid, cv=5, n_jobs=-1)
grid.fit(Tweet_train, Label_train)


In [None]:

# Show which parameter combination won the search and the score it achieved
print("Best parameters: ", grid.best_params_)
print("Best score: ", grid.best_score_)

# Predict the held-out tweets with the fitted search object and print the per-class metrics
new = grid.predict(Tweet_test)
tuned_report = classification_report(Label_test, new)
print(tuned_report)


Best parameters:  {'classifier__C': 10, 'classifier__penalty': 'l2', 'tfidf__max_features': 30000, 'tfidf__ngram_range': (1, 2)}
Best score:  0.9007881248119223
                  precision    recall  f1-score   support

     Hate_Speech       0.89      0.85      0.87      2338
Offensive_Speech       0.92      0.93      0.93      3838
     Safe_Speech       0.89      0.92      0.91      2770

        accuracy                           0.90      8946
       macro avg       0.90      0.90      0.90      8946
    weighted avg       0.90      0.90      0.90      8946



In [None]:
import pickle

# Persist the fitted GridSearchCV object (not just the pipeline) to 'grid.pkl'
with open('grid.pkl', 'wb') as fh:
    pickle.dump(grid, fh)