# Classifing hate speech in tweets using Support Vector Machine 

In [1]:
import pandas as pd
import requests
import io
import sklearn.model_selection as ms
from sklearn import svm
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV


In [2]:
# Downloading the csv file from GitHub
url = "https://raw.githubusercontent.com/Aaron9812/Data_mining/main/data/220505_train_data_preprocessed.csv"
download = requests.get(url).content

# Reading the downloaded content and turning it into a pandas dataframe
df = pd.read_csv(io.StringIO(download.decode('utf-8')), sep=";")

# Printing out the first row of the dataframe
print(df.columns)

Index(['id', 'label', 'tweet', 'n_mentions', 'hashtags', 'without_puctioation',
       'tweet_lower', 'tweet_token', 'clean_token', 'clean_hashtags',
       'stemmed_tokens', 'stemmed_hashtags', 'lemmatized_tokens',
       'lemmatized_hashtags', 'tfidf_stemmed_tokens', 'tfidf_stemmed_hashtags',
       'tfidf_lemmatized_tokens', 'tfidf_lemmatized_hashtags'],
      dtype='object')


In [3]:
features = [
"tweet",
"hashtags",
"without_puctioation",
"tweet_lower",
"tweet_token",
"clean_token",
"clean_hashtags",
"stemmed_tokens",
"stemmed_hashtags",
"lemmatized_tokens",
"lemmatized_hashtags"
]

### only text features used (so far); no numerical features included!!

In [4]:
X = df[features]
y = df.label
X.head()


Unnamed: 0,tweet,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,best #lawofattraction #resources for #healing!...,"['lawofattraction', 'resources', 'healing', 'a...",best lawofattraction resources for healing ...,best lawofattraction resources for healing ...,"['lawofattraction', 'for', 'altwaystoheal', 'is']","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resources', 'healing', 'a...","['lawofattract', 'altwaystoh']","['lawofattract', 'resourc', 'heal', 'altwaysto...","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resource', 'healing', 'al..."
1,remembering to focus on the simplest happy mom...,"['blogger', 'blog', 'life']",remembering to focus on the simplest happy mom...,remembering to focus on the simplest happy mom...,"['to', 'on', 'simplest', 'moments', 'life', 'b...","['simplest', 'moments', 'life', 'blogger', 'li...","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']"
2,when you get as happy as your boyfriend to be ...,['silvia'],when you get as happy as your boyfriend to be ...,when you get as happy as your boyfriend to be ...,"['you', 'as', 'as', 'boyfriend', 'be', 'with',...","['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia']
3,why do you always try to make me happy? i don...,"['love', 'devotion']",why do you always try to make me happy i dont...,why do you always try to make me happy i dont...,"['do', 'always', 'to', 'me', 'i', 'know', 'to'...","['always', 'know', 'love']","['love', 'devotion']","['alway', 'know', 'love']","['love', 'devot']","['always', 'know', 'love']","['love', 'devotion']"
4,omg is finally here!!! #ps4 #farcry4 #gtav #un...,"['ps4', 'farcry4', 'gtav', 'unchaed4']",omg is finally here ps4 farcry4 gtav unchaed4,omg is finally here ps4 farcry4 gtav unchaed4,"['is', 'here', 'farcry4', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']"


In [5]:
# Source used:
# https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1
'''
Train-Test-Split (source vs. code here)

trainData --> X_train
testData --> X_test
trainData['Label'] --> y_train
testData['Label'] --> y_test
'''

"\nTrain-Test-Split (source vs. code here)\n\ntrainData --> X_train\ntestData --> X_test\ntrainData['Label'] --> y_train\ntestData['Label'] --> y_test\n"

In [5]:
(X_train, X_test, y_train, y_test) = ms.train_test_split(X, y, test_size=0.2, random_state = 17, stratify=y)

In [7]:
print("# rows dataset: ", len(df))
print("X_train:", len(X_train),  "   X_test:", len(X_test), "   y_train:",  len(y_train), "   y_test:", len(y_test))
print("X_train + X_test =", (len(X_train) + len(X_test)))

# rows dataset:  6393
X_train: 5114    X_test: 1279    y_train: 5114    y_test: 1279
X_train + X_test = 6393


In [7]:
# Create feature vectors
vectorizer = TfidfVectorizer()

# C-Support Vector Classification

Using each column indiviudally from our pre-processed data in order to find best kernel for the SVC.

In [8]:
kernels = ["linear", "rbf", "poly", "sigmoid"]

In [9]:
feature_selection = [
"tweet"
]

In [11]:
for kernel in kernels:
    print("f1 scores with ", kernel, "kernel \n")
    for feature in feature_selection:
        vectors_train = vectorizer.fit_transform(X_train[feature])
        vectors_test = vectorizer.transform(X_test[feature])
        # Perform classification with SVM
        classifier = svm.SVC(kernel= kernel)
        classifier.fit(vectors_train, y_train)

        prediction = classifier.predict(vectors_test)

        # results
        """
        # alternative: print reports
        report = classification_report(y_test, prediction, output_dict=True)
        # print(feature,": ", report['1'])
        """
        f1 = f1_score(y_test, prediction)
        print(feature,": ", f1)
    print("____________________________________________ \n")

f1 scores with  linear kernel 

tweet :  0.4640000000000001
____________________________________________ 

f1 scores with  rbf kernel 

tweet :  0.21359223300970873
____________________________________________ 

f1 scores with  poly kernel 

tweet :  0.19607843137254902
____________________________________________ 

f1 scores with  sigmoid kernel 

tweet :  0.4166666666666667
____________________________________________ 



### SCV with balanced class weights

In [12]:
for kernel in kernels:
    print("f1 scores with ", kernel, "kernel \n")
    for feature in feature_selection:
        vectors_train = vectorizer.fit_transform(X_train[feature])
        vectors_test = vectorizer.transform(X_test[feature])
        # Perform classification with SVM
        classifier = svm.SVC(kernel= kernel, class_weight='balanced')
        classifier.fit(vectors_train, y_train)

        prediction = classifier.predict(vectors_test)

        # results
        """
        # alternative: print reports
        report = classification_report(y_test, prediction, output_dict=True)
        # print(feature,": ", report['1'])
        """
        f1 = f1_score(y_test, prediction)
        print(feature,": ", f1)
    print("____________________________________________ \n")

f1 scores with  linear kernel 

tweet :  0.632183908045977
____________________________________________ 

f1 scores with  rbf kernel 

tweet :  0.4552845528455284
____________________________________________ 

f1 scores with  poly kernel 

tweet :  0.2476190476190476
____________________________________________ 

f1 scores with  sigmoid kernel 

tweet :  0.608695652173913
____________________________________________ 



### SCV with scaled vectors & balanced class weights

In [11]:
from sklearn.preprocessing import StandardScaler
vectorizer = TfidfVectorizer()
scaler = StandardScaler(with_mean=False)

vec_train = vectorizer.fit_transform(X_train["tweet"])
vec_test = vectorizer.transform(X_test["tweet"])
vectors_train = scaler.fit_transform(vec_train)
vectors_test = scaler.transform(vec_test)

#vectors_train = scaler.transform(vectors_train)
#vectors_test = scaler.transform(vectors_test)


In [12]:
for kernel in kernels:
    print("f1 scores with ", kernel, "kernel \n")
    for feature in feature_selection:
        # Perform classification with SVM
        classifier = svm.SVC(kernel= kernel, class_weight='balanced')
        classifier.fit(vectors_train, y_train)

        prediction = classifier.predict(vectors_test)

        # results
        f1 = f1_score(y_test, prediction)
        print(feature,": ", f1)
    print("____________________________________________ \n")

f1 scores with  linear kernel 

tweet :  0.45925925925925926
____________________________________________ 

f1 scores with  rbf kernel 

tweet :  0.29629629629629634
____________________________________________ 

f1 scores with  poly kernel 

tweet :  0.17821782178217824
____________________________________________ 

f1 scores with  sigmoid kernel 

tweet :  0.5359477124183005
____________________________________________ 



## Using Grid Search for "tweet" with linear kernel

In [None]:
param_grid={'C': [0.001,0.01,0.1,1,10,100,1000],  
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ["linear", "rbf", "poly", "sigmoid"],
            'class_weight': ["balanced"]}
start_time = time.time()
vectors_train = vectorizer.fit_transform(X_train["tweet"])
vectors_test = vectorizer.transform(X_test["tweet"])

svm_estimator = svm.SVC()

svm_cv = GridSearchCV(svm_estimator, param_grid, cv=3)
svm_cv.fit(vectors_train, y_train)

In [None]:
print("--- %s seconds ---" % (time.time() - start_time))
print("f1 for ", f1_score(y_test, svm_cv.predict(vectors_test)))

### Results (run in Colab)
--- 905.0211062431335 seconds ---
f1 for  0.6206896551724138

In [13]:
"""
# Result:
GridSearchCV(cv=3, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']})
--- 905.0211062431335 seconds ---
f1 for  0.6206896551724138
"""

"\nGridSearchCV(cv=3, estimator=SVC(),\n             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],\n                         'class_weight': ['balanced'],\n                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']})\n--- 905.0211062431335 seconds ---\nf1 for  0.6206896551724138\n"

## <span style="color: blue;"> to do:</span>

- <span style="color: blue;"> Why best result here worse than "tweet", with linear kernel?</span>
- <span style="color: blue;"> remove stop words</span>
- <span style="color: blue;"> Finding out which parameters were most successful</span>
- <span style="color: blue;"> add Confusion Matrix</span>

## <span style="color: blue;"> ideas:</span>

- <span style="color: blue;"> Can we see those examples that were categorized wrong? </span>

## keeping track of results

- First SVM (linear) with f1 score for the class "hate-speech" around 0.3 for "tweets" and most pre-preprocessed text. Lemmatization and stemming do not or only slightly improve results. 
-Testing out different kernels gives worse results for "rbf" and "poly". "sigmoid" produces similar results.
- --> !!! vectors not scaled --> bad performance of some of the models?
- using weighted classes gives f1 scores up to 0.62
- using GridSearch for the least processed column "tweets" gives f1 score of 0.59 (parameters C, kernel and class_weight); using more-preprocessing beforehand might improve results
- meaning of parameters:
    - gamma:tries to fit non-linear data (maybe only useful for some of the "features"?
    - C: balancing out model complexity and training errors
    - weight-class:
    - summary of characters of SVM: pp. 496-498 (Tan et. al.), e.g. robust to noise in comparison to decision tree
- scaling parameters: f1 down to 50% (for "tweet")