In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("cyberbullying_tweets.csv")
my_tags = ['religion','age','gender','ethnicity','not_cyberbullying','other_cyberbullying']

In [21]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['tweet_text'])
Y = df['cyberbullying_type']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [23]:
ridge_classifier = RidgeClassifier()

In [24]:
param_grid = {
    'alpha': [0.01, 0.1, 1.0],  # Regularization strength
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

In [25]:
grid_search = GridSearchCV(ridge_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

45 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Adeel\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Adeel\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Adeel\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py", line 1446, in fit
    super().fit(X, Y, sample_weight=sample_weight)
  File "C:\Users\Adeel\anaconda3\Lib\site-packages\sklearn\linear_model

In [28]:
best_params = grid_search.best_params_
best_params

{'alpha': 1.0, 'solver': 'auto'}

In [29]:
best_ridge_classifier = grid_search.best_estimator_
best_ridge_classifier

In [37]:
ridge_classifier.fit(X_train, y_train)

In [39]:
y_pred=ridge_classifier.predict(X_test)
y_pred

array(['ethnicity', 'gender', 'ethnicity', ..., 'age', 'ethnicity',
       'other_cyberbullying'], dtype='<U19')

In [30]:
accuracy = best_ridge_classifier.score(X_test, y_test)
accuracy

0.8254534018240905

In [40]:
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.8254534018240905
                     precision    recall  f1-score   support

           religion       0.95      0.98      0.96      1603
                age       0.98      0.97      0.97      1603
             gender       0.90      0.85      0.88      1531
          ethnicity       0.59      0.56      0.57      1624
  not_cyberbullying       0.61      0.64      0.62      1612
other_cyberbullying       0.93      0.96      0.94      1566

           accuracy                           0.83      9539
          macro avg       0.83      0.83      0.83      9539
       weighted avg       0.82      0.83      0.82      9539

