In [20]:
import pandas as pd
import numpy as np
import re
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("cyberbullying_tweets.csv")
af = pd.read_csv("cyberbullying_tweets.csv")

In [3]:
print(df.head())

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [4]:
df = df.applymap(lambda x: ' '.join(word for word in str(x).split() if not word.startswith('@')))

print(f"Words starting with '@' removed successfully and saved")


Words starting with '@' removed successfully and saved


In [5]:
ampersand_pattern = re.compile(r'&\S*')

# Remove strings starting with '&' from all string columns
df = df.applymap(lambda x: ampersand_pattern.sub('', str(x)))


In [6]:
# Define a regular expression pattern to match web links
link_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Remove web links from all columns
df = df.applymap(lambda x: re.sub(link_pattern, '', str(x)))



In [7]:
www_pattern = re.compile(r'www\.[^\s]+')
df = df.applymap(lambda x: re.sub(www_pattern, '', str(x)))


In [8]:
df = df.applymap(lambda x: ''.join(char for char in str(x) if not char.isdigit()))


In [9]:
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [10]:
df = df.applymap(lambda x: x.replace("rt", "") if isinstance(x, str) else x)


In [11]:
#includes emoticons
df = df.applymap(lambda x: re.sub(r'[^\w\s]', '', str(x)) if isinstance(x, str) else x)


In [12]:
custom_stopwords = set([
            "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
            "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
            "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was",
            "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and",
            "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between",
            "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
            "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both",
            "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
            "s", "t", "can", "will", "just", "don", "should", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn", "didn",
            "doesn", "hadn", "hasn", "haven", "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn"
        ])
df = df.applymap(lambda x: ' '.join(word for word in str(x).split() if word.lower() not in custom_stopwords))


In [13]:
custom_abbrevations = set([
            "lol", "omg", "lmao", "imo", "btw", "idk", "tbh", "rn", "thx", "brb", "asap", "aka", "irl", "smh", "tldr", "plz", "bc", "ily"
        ])
df = df.applymap(lambda x: ' '.join(word for word in str(x).split() if word.lower() not in custom_stopwords))

df

Unnamed: 0,tweet_text,cyberbullying_type
0,words katandandre food crapilicious mkr,not_cyberbullying
1,aussietv white mkr theblock imacelebrityau tod...,not_cyberbullying
2,classy whore red velvet cupcakes,not_cyberbullying
3,meh p thanks heads concerned another angry dud...,not_cyberbullying
4,isis account pretending kurdish account like i...,not_cyberbullying
...,...,...
47687,black ppl arent expected anything depended any...,ethnicity
47688,turner withhold disappointment turner called c...,ethnicity
47689,swear god dumb nigger bitch got bleach hair re...,ethnicity
47690,yea fuck youre nigger fucking unfollow fucking...,ethnicity


In [14]:
print("Before Transformation:")
print(df['cyberbullying_type'].unique())

df['cyberbullying_type'] = df['cyberbullying_type'].apply(lambda x: 1 if str(x).lower() == 'not_cyberbullying' else 0)

print("After Transformation:")
print(df['cyberbullying_type'].unique())

df.to_csv("Final.csv", index=False)

Before Transformation:
['not_cyberbullying' 'gender' 'religion' 'other_cyberbullying' 'age'
 'ethnicity']
After Transformation:
[1 0]


In [15]:
print(df.isnull().sum())
df = df.dropna()
print(df.describe())

#Encoding
label_encoder = LabelEncoder()
df['cyberbullying_type']= label_encoder.fit_transform(df['cyberbullying_type'])
df['tweet_text']= label_encoder.fit_transform(df['tweet_text'])
                                                      
X = df[['tweet_text']]
y = df['cyberbullying_type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", y_train.shape)
print("Shape of Y_test:", y_test.shape)

tweet_text            0
cyberbullying_type    0
dtype: int64
       cyberbullying_type
count        47692.000000
mean             0.166590
std              0.372613
min              0.000000
25%              0.000000
50%              0.000000
75%              0.000000
max              1.000000
Shape of X_train: (38153, 1)
Shape of X_test: (9539, 1)
Shape of Y_train: (38153,)
Shape of Y_test: (9539,)


In [16]:
#Model
#Smaller C values might be preferred when there is noise in the data or when a simpler decision boundary is desired.
#Larger C values may be suitable when the training data is well-behaved and a more complex decision boundary is needed.
param_grid = {'C': [0.1, 1, 10, 100]}


clf = SVC(kernel='linear')
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_C = grid_search.best_params_['C']
print("Best C:", best_C)




Best C: 0.1


In [17]:
best_svm_classifier = SVC(kernel='linear', C=best_C)
best_svm_classifier.fit(X_train, y_train)

In [1]:
y_pred = best_svm_classifier.predict(X_test)

# Evaluate the model's performance
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

NameError: name 'best_svm_classifier' is not defined