# Classifing hate speech in tweets using Support Vector Machine 

In [None]:
import pandas as pd
import requests
import io
import sklearn.model_selection as ms
from sklearn import svm
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV


In [2]:
# Downloading the csv file from GitHub
url = "https://raw.githubusercontent.com/Aaron9812/Data_mining/main/data/220505_train_data_preprocessed.csv"
download = requests.get(url).content

# Reading the downloaded content and turning it into a pandas dataframe
df = pd.read_csv(io.StringIO(download.decode('utf-8')), sep=";")

# Printing out the first row of the dataframe
print(df.columns)

Index(['id', 'label', 'tweet', 'n_mentions', 'hashtags', 'without_puctioation',
       'tweet_lower', 'tweet_token', 'clean_token', 'clean_hashtags',
       'stemmed_tokens', 'stemmed_hashtags', 'lemmatized_tokens',
       'lemmatized_hashtags', 'tfidf_stemmed_tokens', 'tfidf_stemmed_hashtags',
       'tfidf_lemmatized_tokens', 'tfidf_lemmatized_hashtags'],
      dtype='object')


In [3]:
features = [
"tweet",
"hashtags",
"without_punctuation",
"tweet_lower",
"tweet_token",
"clean_token",
"clean_hashtags",
"stemmed_tokens",
"stemmed_hashtags",
"lemmatized_tokens",
"lemmatized_hashtags"
]

### only text features used (so far); no numerical features included!!

In [4]:
X = df[features]
y = df.label
X.head()


Unnamed: 0,tweet,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,keisha grey and robin sadie exposes her mature...,"['robin', 'keisha', 'grey', 'mature']",keisha grey and robin sadie exposes her mature...,keisha grey and robin sadie exposes her mature...,"['grey', 'robin', 'exposes', 'and', 'on', 'rob...","['grey', 'robin', 'exposes', 'robin', 'grey', ...","['robin', 'keisha', 'grey', 'mature']","['grey', 'robin', 'expos', 'robin', 'grey', 'm...","['robin', 'keisha', 'grey', 'matur']","['grey', 'robin', 'expose', 'robin', 'grey', '...","['robin', 'keisha', 'grey', 'mature']"
1,ð· themeowood: puppy,[],ð· themeowood puppy,ð· themeowood puppy,['themeowood'],['themeowood'],[],['themeowood'],[],['themeowood'],[]
2,#angels #archangels from on high here to help ...,"['angels', 'archangels', 'spiritual']",angels archangels from on high here to help yo...,angels archangels from on high here to help yo...,"['archangels', 'on', 'here', 'help', 'live', '...","['archangels', 'help', 'live', 'lives']","['angels', 'archangels', 'spiritual']","['archangel', 'help', 'live', 'live']","['angel', 'archangel', 'spiritu']","['archangel', 'help', 'live', 'life']","['angel', 'archangel', 'spiritual']"
3,i am thankful for being able to remember. #tha...,"['thankful', 'positive']",i am thankful for being able to remember thank...,i am thankful for being able to remember thank...,"['am', 'for', 'able', 'remember', 'positive']","['able', 'remember', 'positive']","['thankful', 'positive']","['abl', 'rememb', 'posit']","['thank', 'posit']","['able', 'remember', 'positive']","['thankful', 'positive']"
4,rip anton yelchin. a young good actor from the...,"['ripantonyelchin', 'actor', 'antonyelchin']",rip anton yelchin a young good actor from the ...,rip anton yelchin a young good actor from the ...,"['anton', 'a', 'from', 'star', 'reboots', 'a',...","['anton', 'star', 'reboots', 'good', 'kind', '...","['ripantonyelchin', 'actor', 'antonyelchin']","['anton', 'star', 'reboot', 'good', 'kind', 'r...","['ripantonyelchin', 'actor', 'antonyelchin']","['anton', 'star', 'reboots', 'good', 'kind', '...","['ripantonyelchin', 'actor', 'antonyelchin']"


In [5]:
# Source used:
# https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1
'''
Train-Test-Split (source vs. code here)

trainData --> X_train
testData --> X_test
trainData['Label'] --> y_train
testData['Label'] --> y_test
'''

"\nTrain-Test-Split (source vs. code here)\n\ntrainData --> X_train\ntestData --> X_test\ntrainData['Label'] --> y_train\ntestData['Label'] --> y_test\n"

In [6]:
(X_train, X_test, y_train, y_test) = ms.train_test_split(X, y, test_size=0.2, random_state = 17, stratify=y)

In [7]:
print("# rows dataset: ", len(df))
print("X_train:", len(X_train),  "   X_test:", len(X_test), "   y_train:",  len(y_train), "   y_test:", len(y_test))
print("X_train + X_test =", (len(X_train) + len(X_test)))

# rows dataset:  22372
X_train: 17897    X_test: 4475    y_train: 17897    y_test: 4475
X_train + X_test = 22372


In [8]:
# Create feature vectors
vectorizer = TfidfVectorizer(sublinear_tf = True)

# C-Support Vector Classification

Using each column indiviudally from our pre-processed data in order to find best kernel for the SVC.

In [27]:
kernels = ["linear", "rbf", "poly", "sigmoid"]

In [28]:
feature_selection = [
"tweet",
"tweet_lower",
]

In [None]:
for kernel in kernels:
    print("f1 scores with ", kernel, "kernel \n")
    for feature in feature_selection:
        vectors_train = vectorizer.fit_transform(X_train[feature])
        vectors_test = vectorizer.transform(X_test[feature])
        # Perform classification with SVM
        classifier = svm.SVC(kernel= kernel)
        classifier.fit(vectors_train, y_train)

        prediction = classifier.predict(vectors_test)

        # results
        """
        # alternative: print reports
        report = classification_report(y_test, prediction, output_dict=True)
        # print(feature,": ", report['1'])
        """
        f1 = f1_score(y_test, prediction)
        print(feature,": ", f1)
    print("____________________________________________ \n")

### SCV with balanced class weights

In [None]:
for kernel in kernels:
    print("f1 scores with ", kernel, "kernel \n")
    for feature in feature_selection:
        vectors_train = vectorizer.fit_transform(X_train[feature])
        vectors_test = vectorizer.transform(X_test[feature])
        # Perform classification with SVM
        classifier = svm.SVC(kernel= kernel, class_weight='balanced')
        classifier.fit(vectors_train, y_train)

        prediction = classifier.predict(vectors_test)

        # results
        """
        # alternative: print reports
        report = classification_report(y_test, prediction, output_dict=True)
        # print(feature,": ", report['1'])
        """
        f1 = f1_score(y_test, prediction)
        print(feature,": ", f1)
    print("____________________________________________ \n")

f1 scores with  linear kernel 

tweet :  0.7149606299212599
tweet_lower :  0.7169811320754716
____________________________________________ 

f1 scores with  rbf kernel 

tweet :  0.6653386454183267
tweet_lower :  0.6666666666666667
____________________________________________ 

f1 scores with  poly kernel 

tweet :  0.40909090909090906


### SCV with scaled vectors & balanced class weights

In [23]:
from sklearn.preprocessing import StandardScaler
vectorizer = TfidfVectorizer()
scaler = StandardScaler(with_mean=False)

vec_train = vectorizer.fit_transform(X_train["tweet"])
vec_test = vectorizer.transform(X_test["tweet"])
vectors_train = scaler.fit(vec_train)
vectors_test = scaler.fit(vec_test)

vectors_train = scaler.transform(vec_train)
vectors_test = scaler.transform(vec_test)


In [None]:
for kernel in kernels:
    print("f1 scores with ", kernel, "kernel \n")
    for feature in feature_selection:
        vectors_train = vectorizer.fit_transform(X_train[feature])
        vectors_test = vectorizer.transform(X_test[feature])
        # Perform classification with SVM
        classifier = svm.SVC(kernel= kernel)
        classifier.fit(vectors_train, y_train)

        prediction = classifier.predict(vectors_test)

        # results
        f1 = f1_score(y_test, prediction)
        print(feature,": ", f1)
    print("____________________________________________ \n")

## Using Grid Search for "tweet" with linear kernel

In [12]:
vectorizer = TfidfVectorizer()
scaler = StandardScaler(with_mean=False)

vec_train = vectorizer.fit_transform(X_train["tweet"])
vec_test = vectorizer.transform(X_test["tweet"])
vectors_train = scaler.fit(vec_train)
vectors_test = scaler.fit(vec_test)

X_train_grid = scaler.transform(vec_train)
X_test_grid = scaler.transform(vec_test)


In [25]:
# Perform classification with SVM
classifier = svm.SVC(kernel= "linear", class_weight='balanced')
classifier.fit(vectors_train, y_train)

prediction = classifier.predict(vectors_test)

# results
"""
# alternative: print reports
report = classification_report(y_test, prediction, output_dict=True)
# print(feature,": ", report['1'])
"""
f1 = f1_score(y_test, prediction)
print(feature,": ", f1)

tweet :  0.5153664302600472


In [None]:
svm_estimator = svm.SVC()

model = GridSearchCV(estimator=svm_estimator,
             param_grid={'C': [1, 10, ], 'kernel': ("linear", "rbf", "poly", "sigmoid"), 'class_weight': (None, "balanced")})
model.fit(X_train_grid, y_train)


In [None]:
f1_score(y_test, model.predict(X_test_grid))

## <span style="color: blue;"> to do:</span>

- <span style="color: blue;"> Why best result here worse than "tweet", with linear kernel?</span>
- <span style="color: blue;"> Finding out which parameters were most successful</span>
- <span style="color: blue;"> add Confusion Matrix</span>


## keeping track of results

- First SVM (linear) with f1 score for the class "hate-speech" around 0.3 for "tweets" and most pre-preprocessed text. Lemmatization and stemming do not or only slightly improve results. 
-Testing out different kernels gives worse results for "rbf" and "poly". "sigmoid" produces similar results.
- --> !!! vectors not scaled --> bad performance of some of the models?
- using weighted classes gives f1 scores up to 0.62
- using GridSearch for the least processed column "tweets" gives f1 score of 0.59 (parameters C, kernel and class_weight); using more-preprocessing beforehand might improve results
- meaning of parameters:
    - gamma:tries to fit non-linear data (maybe only useful for some of the "features"?
    - C:
    - weight-class:
- scaling parameters: f1 down to 50% (for "tweet")