# Classifing hate speech in tweets using Support Vector Machine 

In [1]:
import pandas as pd
import requests
import io
import sklearn.model_selection as ms
from sklearn import svm
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [2]:
# Downloading the csv file from GitHub
url = "https://raw.githubusercontent.com/Aaron9812/Data_mining/main/data/220505_train_data_preprocessed.csv"
download = requests.get(url).content

# Reading the downloaded content and turning it into a pandas dataframe
df = pd.read_csv(io.StringIO(download.decode('utf-8')), sep=";")

# Printing out the first row of the dataframe
print(df.columns)

Index(['id', 'label', 'tweet', 'n_mentions', 'hashtags', 'without_puctioation',
       'tweet_lower', 'tweet_token', 'clean_token', 'clean_hashtags',
       'stemmed_tokens', 'stemmed_hashtags', 'lemmatized_tokens',
       'lemmatized_hashtags', 'tfidf_stemmed_tokens', 'tfidf_stemmed_hashtags',
       'tfidf_lemmatized_tokens', 'tfidf_lemmatized_hashtags'],
      dtype='object')


In [3]:
features = [
"tweet",
"hashtags",
"without_puctioation",
"tweet_lower",
"tweet_token",
"clean_token",
"clean_hashtags",
"stemmed_tokens",
"stemmed_hashtags",
"lemmatized_tokens",
"lemmatized_hashtags"
]

### only text features used (so far); no numerical features included!!

In [4]:
X = df[features]
y = df.label
X.head()


Unnamed: 0,tweet,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,best #lawofattraction #resources for #healing!...,"['lawofattraction', 'resources', 'healing', 'a...",best lawofattraction resources for healing ...,best lawofattraction resources for healing ...,"['lawofattraction', 'for', 'altwaystoheal', 'is']","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resources', 'healing', 'a...","['lawofattract', 'altwaystoh']","['lawofattract', 'resourc', 'heal', 'altwaysto...","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resource', 'healing', 'al..."
1,remembering to focus on the simplest happy mom...,"['blogger', 'blog', 'life']",remembering to focus on the simplest happy mom...,remembering to focus on the simplest happy mom...,"['to', 'on', 'simplest', 'moments', 'life', 'b...","['simplest', 'moments', 'life', 'blogger', 'li...","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']"
2,when you get as happy as your boyfriend to be ...,['silvia'],when you get as happy as your boyfriend to be ...,when you get as happy as your boyfriend to be ...,"['you', 'as', 'as', 'boyfriend', 'be', 'with',...","['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia']
3,why do you always try to make me happy? i don...,"['love', 'devotion']",why do you always try to make me happy i dont...,why do you always try to make me happy i dont...,"['do', 'always', 'to', 'me', 'i', 'know', 'to'...","['always', 'know', 'love']","['love', 'devotion']","['alway', 'know', 'love']","['love', 'devot']","['always', 'know', 'love']","['love', 'devotion']"
4,omg is finally here!!! #ps4 #farcry4 #gtav #un...,"['ps4', 'farcry4', 'gtav', 'unchaed4']",omg is finally here ps4 farcry4 gtav unchaed4,omg is finally here ps4 farcry4 gtav unchaed4,"['is', 'here', 'farcry4', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']"


In [5]:
# Source used:
# https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1
'''
Train-Test-Split (source vs. code here)

trainData --> X_train
testData --> X_test
trainData['Label'] --> y_train
testData['Label'] --> y_test
'''

"\nTrain-Test-Split (source vs. code here)\n\ntrainData --> X_train\ntestData --> X_test\ntrainData['Label'] --> y_train\ntestData['Label'] --> y_test\n"

In [6]:
(X_train, X_test, y_train, y_test) = ms.train_test_split(X, y, test_size=0.2, random_state = 17, stratify=y)

In [7]:
print("# rows dataset: ", len(df))
print("X_train:", len(X_train),  "   X_test:", len(X_test), "   y_train:",  len(y_train), "   y_test:", len(y_test))
print("X_train + X_test =", (len(X_train) + len(X_test)))

# rows dataset:  6393
X_train: 5114    X_test: 1279    y_train: 5114    y_test: 1279
X_train + X_test = 6393


# C-Support Vector Classification

Using each column indiviudally from our pre-processed data in order to find best kernel for the SVC.

### SCV with equal class weights

In [8]:
# Create feature vectors
vectorizer = TfidfVectorizer()

In [9]:
kernels = ["linear", "rbf", "poly", "sigmoid"]

In [10]:
for kernel in kernels:
    print("f1 scores with ", kernel, "kernel \n")
    for feature in features:
        vectors_train = vectorizer.fit_transform(X_train[feature])
        vectors_test = vectorizer.transform(X_test[feature])
        # Perform classification with SVM
        classifier = svm.SVC(kernel= kernel)
        classifier.fit(vectors_train, y_train)

        prediction = classifier.predict(vectors_test)

        # results
        """
        # alternative: print reports
        report = classification_report(y_test, prediction, output_dict=True)
        # print(feature,": ", report['1'])
        """
        f1 = f1_score(y_test, prediction)
        print(feature,": ", f1)
    print("____________________________________________ \n")

f1 scores with  linear kernel 

tweet :  0.4640000000000001
hashtags :  0.4406779661016949
without_puctioation :  0.4715447154471545
tweet_lower :  0.4715447154471545
tweet_token :  0.3728813559322034
clean_token :  0.35
clean_hashtags :  0.4406779661016949
stemmed_tokens :  0.3620689655172414
stemmed_hashtags :  0.4878048780487805
lemmatized_tokens :  0.3529411764705882
lemmatized_hashtags :  0.46280991735537186
____________________________________________ 

f1 scores with  rbf kernel 

tweet :  0.19607843137254902
hashtags :  0.34234234234234234
without_puctioation :  0.19607843137254902
tweet_lower :  0.19607843137254902
tweet_token :  0.2476190476190476
clean_token :  0.2452830188679245
clean_hashtags :  0.34234234234234234
stemmed_tokens :  0.2616822429906542
stemmed_hashtags :  0.3571428571428571
lemmatized_tokens :  0.27777777777777773
lemmatized_hashtags :  0.3571428571428571
____________________________________________ 

f1 scores with  poly kernel 

tweet :  0.196078431372549

## <span style="color: blue;"> to do:</span>
 <span style="color: blue;"> scaling vectors to improve models for some of the kernels?</span>

### SCV with balanced class weights

In [11]:
for kernel in kernels:
    print("f1 scores with ", kernel, "kernel \n")
    for feature in features:
        vectors_train = vectorizer.fit_transform(X_train[feature])
        vectors_test = vectorizer.transform(X_test[feature])
        # Perform classification with SVM
        classifier = svm.SVC(kernel= kernel, class_weight='balanced')
        classifier.fit(vectors_train, y_train)

        prediction = classifier.predict(vectors_test)

        # results
        """
        # alternative: print reports
        report = classification_report(y_test, prediction, output_dict=True)
        # print(feature,": ", report['1'])
        """
        f1 = f1_score(y_test, prediction)
        print(feature,": ", f1)
    print("____________________________________________ \n")

f1 scores with  linear kernel 

tweet :  0.6206896551724138
hashtags :  0.48529411764705876
without_puctioation :  0.6395348837209303
tweet_lower :  0.6395348837209303
tweet_token :  0.5053763440860215
clean_token :  0.4972375690607735
clean_hashtags :  0.48529411764705876
stemmed_tokens :  0.5154639175257731
stemmed_hashtags :  0.5106382978723404
lemmatized_tokens :  0.5212765957446808
lemmatized_hashtags :  0.4892086330935252
____________________________________________ 

f1 scores with  rbf kernel 

tweet :  0.46031746031746035
hashtags :  0.3793103448275862
without_puctioation :  0.47244094488188976
tweet_lower :  0.47244094488188976
tweet_token :  0.3851851851851852
clean_token :  0.3664921465968587
clean_hashtags :  0.3793103448275862
stemmed_tokens :  0.3522727272727273
stemmed_hashtags :  0.4067796610169491
lemmatized_tokens :  0.37837837837837834
lemmatized_hashtags :  0.3931623931623932
____________________________________________ 

f1 scores with  poly kernel 

tweet :  0.24

## Usin Grid Search for "tweet"

In [12]:
X_train_grid = vectorizer.fit_transform(X_train["tweet"])
X_test_grid = vectorizer.transform(X_test["tweet"])

svm_estimator = svm.SVC()

svm = GridSearchCV(estimator=svm_estimator,
             param_grid={'C': [1, 10, ], 'kernel': ("linear", "rbf", "poly", "sigmoid"), 'class_weight': (None, "balanced")})
svm.fit(X_train_grid, y_train)


GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'class_weight': (None, 'balanced'),
                         'kernel': ('linear', 'rbf', 'poly', 'sigmoid')})

In [13]:
f1_score(y_test, svm.predict(X_test_grid))

0.5935483870967742

## <span style="color: blue;"> to do:</span>

- <span style="color: blue;"> Why best result here worse than "tweet", with linear kernel?</span>
- <span style="color: blue;"> Finding out which parameters were most successful</span>
- <span style="color: blue;"> add Confusion Matrix</span>

## <span style="color: blue;"> double check:</span>
- <span style="color: blue;"> Does GridSearch use any variables that has been altered by the loops before?</span>

## keeping track of results

- First SVM (linear) with f1 score for the class "hate-speech" around 0.3 for "tweets" and most pre-preprocessed text. Lemmatization and stemming do not or only slightly improve results. 
-Testing out different kernels gives worse results for "rbf" and "poly". "sigmoid" produces similar results.
- --> !!! vectors not scaled --> bad performance of some of the models?
- using weighted classes gives f1 scores up to 0.62
- using GridSearch for the least processed column "tweets" gives f1 score of 0.59 (parameters C, kernel and class_weight); using more-preprocessing beforehand might improve results