In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import tensorflow as tf

import re
import string

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import optimizers, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, Flatten
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# The CSV files have no header row: the first column becomes the index and
# the remaining three columns are named explicitly.
train = pd.read_csv(
    'twitter_training.csv',
    index_col=0,
    header=None,
    names=['entity', 'label', 'text'],
)
test = pd.read_csv(
    'twitter_validation.csv',
    index_col=0,
    header=None,
    names=['entity', 'label', 'text'],
)

In [None]:
# Display the raw training frame (columns: entity, label, text).
train

Unnamed: 0,entity,label,text
2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2401,Borderlands,Positive,im coming on borderlands and i will murder you...
2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...
9200,Nvidia,Positive,Just realized that the Windows partition of my...
9200,Nvidia,Positive,Just realized that my Mac window partition is ...
9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
9200,Nvidia,Positive,Just realized between the windows partition of...


In [None]:
# Display the raw validation frame (columns: entity, label, text).
test

Unnamed: 0,entity,label,text
3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...
4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


spaCy is a popular open-source library for natural language processing (NLP) in Python.

`en_core_web_sm` is the small pre-trained English pipeline provided by spaCy. It includes a vocabulary, syntax (part-of-speech tagging and dependency parsing), lemmatization, and named entity recognition (NER). Unlike the larger `md` and `lg` pipelines, it does not ship with static word vectors.

In [None]:
# Shared spaCy pipeline; lemmatize() and stop_words() below call this
# module-level object once per text.
nlp = spacy.load('en_core_web_sm')

In [None]:
def lowercase(data):
    """Return the 'text' column of ``data`` converted to lower case.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named 'text'.

    Returns
    -------
    pandas.Series
        Lower-cased copy of ``data['text']``; ``data`` itself is not modified.
    """
    texts = data['text']
    lowered = texts.str.lower()
    return lowered

def change_punctuation(data):
    """Return the 'text' column with every backtick replaced by an apostrophe.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named 'text'.

    Returns
    -------
    pandas.Series
        Copy of ``data['text']`` where each '`' has become "'".
    """
    texts = data['text']
    # The backtick has no special regex meaning, so a literal replacement
    # gives the same result as the pattern-based default.
    return texts.str.replace(pat='`', repl="'", regex=False)

def remove_numbers(data):
    """Strip digits and other unwanted symbols from the 'text' column.

    Despite the name, this keeps only ASCII letters, whitespace and the
    punctuation characters ``. , ! ? / : ; " '``; every other character
    (digits included) is deleted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a column named 'text'.

    Returns
    -------
    pandas.Series
        Copy of ``data['text']`` with the disallowed characters removed.
        Non-string cells are left unchanged by ``Series.replace``.
    """
    # The range must be ``A-Z``. The previous ``A-z`` spans ASCII 65-122 and
    # therefore also let ``[ \ ] ^ _`` and the backtick through. A raw string
    # is used so ``\s`` is not treated as an (invalid) string escape.
    allowed_only = r"[^a-zA-Z.,!?/:;\"'\s]"
    return data['text'].replace(allowed_only, '', regex=True)

def remove_special_characters(data):
    """Return the 'text' column reduced to ASCII letters, digits and spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a column named 'text'.

    Returns
    -------
    pandas.Series
        Copy of ``data['text']`` where every character outside
        ``a-z``, ``A-Z``, ``0-9`` and the space character has been deleted.
    """
    texts = data['text']
    return texts.replace(to_replace='[^a-zA-Z0-9 ]', value='', regex=True)

def custom(data):
    """Expand the informal contraction 'im' to 'i am' in the 'text' column.

    Only the standalone word is rewritten, so words that merely contain
    'im' (e.g. 'time', 'limit') are untouched. Matching is case-sensitive;
    the pipeline lower-cases the text before calling this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a column named 'text'.

    Returns
    -------
    pandas.Series
        Copy of ``data['text']`` with each standalone 'im' replaced.
        Non-string cells are left unchanged by ``Series.replace``.
    """
    # Without regex=True, Series.replace only swaps cells whose ENTIRE value
    # equals 'im', which made this step a no-op on real tweets. A
    # word-boundary regex performs the intended in-text substitution.
    return data['text'].replace(r'\bim\b', 'i am', regex=True)

def lemmatize(data):
    """Lemmatize every entry of the 'text' column with the shared spaCy pipeline.

    Each text is run through the module-level ``nlp`` object, and the
    ``lemma_`` of every token is joined back together with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a column named 'text' whose values ``nlp`` can process.

    Returns
    -------
    list of str
        One lemmatized string per row, in row order.
    """
    return [
        ' '.join(token.lemma_ for token in nlp(text))
        for text in data['text']
    ]

def stop_words(data):
    """Remove stop words from every entry of the 'text' column.

    Each text is tokenized with the module-level ``nlp`` pipeline; tokens
    flagged ``is_stop`` are dropped and the remaining token texts are
    joined with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a column named 'text' whose values ``nlp`` can process.

    Returns
    -------
    list of str
        One filtered string per row, in row order.
    """
    def _without_stop_words(text):
        # Keep the surface form of every token that is not a stop word.
        return ' '.join(tok.text for tok in nlp(text) if not tok.is_stop)

    return [_without_stop_words(text) for text in data['text']]

def delete_links(data):
    """Return the 'text' column with URL-like substrings removed.

    Anything starting with 'http' and running up to the next whitespace
    character is deleted; surrounding spaces are left in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a column named 'text'.

    Returns
    -------
    pandas.Series
        Copy of ``data['text']`` without the matched substrings.
    """
    texts = data['text']
    return texts.replace(to_replace=r'http\S+', value='', regex=True)

def preprocessing(data):
    """Run the full text-cleaning pipeline on a copy of ``data``.

    The 'text' column is rewritten by each step in turn: lower-casing,
    'im' expansion, backtick normalisation, lemmatization, character
    filtering, link removal and finally removal of remaining special
    characters. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named 'text'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose 'text' column holds the cleaned text.
    """
    df = data.copy()
    # Order matters: each step consumes the output of the previous one.
    steps = (
        lowercase,
        custom,
        change_punctuation,
        lemmatize,
        remove_numbers,
        delete_links,
        remove_special_characters,
    )
    for step in steps:
        df['text'] = step(df)
    return df


In [None]:
# Keep one row per distinct tweet text, then move the old index into a
# regular column so the training frame gets a fresh 0..n-1 index.
train = train.drop_duplicates(subset=['text']).reset_index()

# Cast the text to str before cleaning, then run the shared pipeline on
# both frames.
# NOTE(review): astype('str') turns missing values into the literal string
# 'nan' - confirm whether such rows should be dropped instead.
train = preprocessing(train.assign(text=train['text'].astype('str')))
test = preprocessing(test.assign(text=test['text'].astype('str')))

In [None]:
# Settings that are not referenced by the cells visible here.
# NOTE(review): presumably intended for a Keras embedding model - confirm.
max_words = 10000
maxlen = 200
emb_dim = 50

# Map the string labels to integer ids; the encoder is fitted on the
# training labels and reused unchanged for the validation labels.
le = LabelEncoder()
train['label'] = le.fit_transform(train['label'])
test['label'] = le.transform(test['label'])

X, y = train['text'], train['label']
training_samples = int(len(X)*0.8)

text_dataset = tf.data.Dataset.from_tensor_slices(X)

# Hold out 20% of the training frame; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Build TF-IDF features: the vocabulary is fitted on the training split only
# and then applied to the held-out split.
# NOTE: this rebinds X_train / X_test from text to feature matrices, so
# re-run the split cell before re-running this one.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a support vector classifier with scikit-learn's default settings.
model = SVC().fit(X_train, y_train)

# Predict on the held-out 20% split of the training data (not the separate
# validation file) and report per-class metrics.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92      2435
           1       0.93      0.96      0.95      4300
           2       0.94      0.93      0.93      3360
           3       0.92      0.94      0.93      3804

    accuracy                           0.94     13899
   macro avg       0.94      0.93      0.93     13899
weighted avg       0.94      0.94      0.93     13899

