In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize

**Load Data**

In [3]:
# Load the labelled training set; the path is relative to the notebook's working directory.
df=pd.read_csv('train_data.csv')

In [4]:
# Preview the first rows to check the raw `text` and integer `label` columns.
df.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [5]:
# (rows, columns) of the raw frame.
df.shape

(16990, 2)

In [6]:
# Class distribution of the target. The recorded output shows 20 classes with a
# strong imbalance (3545 rows for label 2 vs. 44 for label 11).
df['label'].value_counts()

label
2     3545
18    2118
14    1822
9     1557
5      987
16     985
1      837
19     823
7      624
6      524
15     501
17     495
12     487
13     471
4      359
3      321
0      255
8      166
10      69
11      44
Name: count, dtype: int64

**Text Cleansing**

In [7]:
def cleansing(df):
    """Normalise raw text entries for bag-of-words vectorisation.

    Each entry is lower-cased, has every run of digits removed, has every
    remaining non-word character replaced by a single space, and finally has
    runs of whitespace collapsed to one space. Leading/trailing spaces are
    NOT trimmed, so a result may start or end with a single space.

    Parameters
    ----------
    df : pandas.Series
        Series of text entries (despite the name, a Series is expected because
        string-accessor methods are used). Missing values are treated as empty
        strings and non-string values are converted with ``str`` so that a
        stray NaN in the CSV does not raise ``TypeError`` inside ``re.sub``.

    Returns
    -------
    list of str
        Cleaned strings, in the same order as the input.
    """
    digit_run = re.compile(r"\d+")
    non_word = re.compile(r'[^\w]')
    whitespace_run = re.compile(r'\s+')

    # Coerce to plain strings first: `.str.lower()` alone leaves NaN/None as
    # float NaN, which the regex substitutions below cannot process.
    lowered = df.fillna("").astype(str).str.lower()

    # Order matters: digits are dropped before punctuation becomes spaces, and
    # whitespace is collapsed last so the earlier steps cannot leave gaps.
    return [
        whitespace_run.sub(' ', non_word.sub(' ', digit_run.sub("", text)))
        for text in lowered
    ]

In [8]:
# Add the cleaned text as a new column on `df` (mutates `df` in place).
# `cleansing` returns a plain list, so the values are assigned by position.
df['clean_text']=cleansing(df['text'])

In [9]:
# Spot-check one cleaned row. Per the recorded output, URLs are not removed but
# broken into loose tokens (e.g. 'https t co flswicz').
df['clean_text'].iloc[1]

'buy las vegas sands as travel to singapore builds wells fargo says https t co flswicz'

In [10]:
# Preview again to compare the raw `text` with the new `clean_text` column.
df.head()

Unnamed: 0,text,label,clean_text
0,Here are Thursday's biggest analyst calls: App...,0,here are thursday s biggest analyst calls appl...
1,Buy Las Vegas Sands as travel to Singapore bui...,0,buy las vegas sands as travel to singapore bui...
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,piper sandler downgrades docusign to sell citi...
3,"Analysts react to Tesla's latest earnings, bre...",0,analysts react to tesla s latest earnings brea...
4,Netflix and its peers are set for a ‘return to...,0,netflix and its peers are set for a return to ...


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

**Split Data into Train and Test Sets**

In [12]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [13]:
# Preview the training texts; the index values are the original row labels from `df`.
x_train.head()

5922     it s ecb rate decision day here s what to expe...
13498    twitter users were quick to spot liz truss see...
4517     jetblue announces webcast of second quarter ea...
16161     calm cal maine foods stock ticks higher on re...
1745     tower semiconductor and cadence expand collabo...
Name: clean_text, dtype: object

**Text Representation**

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Learn the vocabulary and IDF weights from the training texts only (default
# TfidfVectorizer settings), then encode those same texts. The held-out texts
# are not seen here, so no test statistics leak into the features.
vectorizer = TfidfVectorizer()
train_tfidf= vectorizer.fit_transform(x_train)


In [16]:
# Wrap the TF-IDF matrix in a DataFrame whose columns are the vocabulary terms,
# so features can later be selected by token name.
# `DataFrame.sparse.from_spmatrix` keeps the storage sparse. Calling
# `.toarray()` here would allocate a dense 13592 x 34539 float64 array
# (~3.8 GB) that is almost entirely zeros.
TFIDF_train = pd.DataFrame.sparse.from_spmatrix(
    train_tfidf,
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_train.head()

Unnamed: 0,__gabriellacruz,_davidgoodman,_joshschafer,_srdash,aa,aaakyfpam,aabzqjst,aad,aadysqy,aaeucwux,...,zzprobvxb,zzrksvrp,zzugvmizx,zzwbhhx,zzwpkfa,zzxfxnjnxy,zzyfiep,zzyjeohkgg,zzzxrmiwfj,åkerström
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# (training documents, vocabulary terms) of the TF-IDF matrix.
train_tfidf.shape

(13592, 34539)

In [18]:
# Keep the vocabulary terms (the DataFrame's column labels) for stopword filtering below.
col=TFIDF_train.columns

In [19]:
# Display the vocabulary terms.
col

Index(['__gabriellacruz', '_davidgoodman', '_joshschafer', '_srdash', 'aa',
       'aaakyfpam', 'aabzqjst', 'aad', 'aadysqy', 'aaeucwux',
       ...
       'zzprobvxb', 'zzrksvrp', 'zzugvmizx', 'zzwbhhx', 'zzwpkfa',
       'zzxfxnjnxy', 'zzyfiep', 'zzyjeohkgg', 'zzzxrmiwfj', 'åkerström'],
      dtype='object', length=34539)

**Remove Stopwords**

In [20]:
# Stopword removal.
# The filter is applied to the vocabulary that the vectorizer has already
# learned, i.e. columns are dropped after the TF-IDF weights were computed.
from nltk.corpus import stopwords
nltk.download('stopwords')

# A set gives constant-time membership checks in the filter below.
list_stopwords = set(stopwords.words('english'))

# Vocabulary terms, in their original column order, that are not English stopwords.
nonstop_tokens = [token for token in col if token not in list_stopwords]

[nltk_data] Downloading package stopwords to /home/andrew/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
# Keep only the non-stopword feature columns.
# NOTE(review): this rebinds `train_tfidf` from the matrix returned by
# `fit_transform` to a DataFrame, so re-running the earlier cell that calls
# `train_tfidf.toarray()` after this one would fail. A distinct name would make
# the notebook safe to re-run out of order.
train_tfidf=TFIDF_train[nonstop_tokens ]

In [22]:
# Confirm the number of feature columns left after stopword filtering.
train_tfidf.shape

(13592, 34404)

**SVM for text classification**

In [23]:
# Train a linear SVM on the stopword-filtered TF-IDF features. All
# hyperparameters are left at their defaults; only the random seed is fixed so
# the fit is repeatable. The last line also displays the fitted estimator.
from sklearn import svm
svm_class = svm.LinearSVC( random_state=42)
svm_class.fit(train_tfidf, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


**Testing**

In [24]:
# Encode the held-out texts with the vocabulary/IDF learned from the training
# split (`transform`, not `fit_transform`, so nothing is re-fitted on test data).
test_tfidf = vectorizer.transform(x_test)

In [25]:
# Wrap the held-out TF-IDF matrix in a DataFrame labelled with the vocabulary
# terms so the same token columns as in training can be selected by name.
# `DataFrame.sparse.from_spmatrix` keeps the storage sparse instead of
# materialising a dense, almost-all-zero array with one column per vocabulary
# term via `.toarray()`.
TFIDF_test = pd.DataFrame.sparse.from_spmatrix(
    test_tfidf,
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_test.head()

Unnamed: 0,__gabriellacruz,_davidgoodman,_joshschafer,_srdash,aa,aaakyfpam,aabzqjst,aad,aadysqy,aaeucwux,...,zzprobvxb,zzrksvrp,zzugvmizx,zzwbhhx,zzwpkfa,zzxfxnjnxy,zzyfiep,zzyjeohkgg,zzzxrmiwfj,åkerström
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Select the same non-stopword columns, in the same order, as used for training
# so the features line up with what the classifier was fitted on.
# NOTE(review): this rebinds `test_tfidf` from the matrix returned by
# `vectorizer.transform` to a DataFrame; re-running the earlier cell that calls
# `test_tfidf.toarray()` after this one would fail.
test_tfidf=TFIDF_test[nonstop_tokens]

In [27]:
# Predicted class label for every held-out document.
test_svm_class=svm_class.predict(test_tfidf)

In [28]:
# Per-class precision/recall/F1 on the held-out split.
# The label set and display names come from the fitted classifier rather than a
# hand-typed list of 20 strings, so the report stays correct if the number of
# classes changes or a class happens to be absent from the held-out data.
class_labels = svm_class.classes_
print('\nClassification Report\n')
print(
    classification_report(
        y_test,
        test_svm_class,
        labels=class_labels,
        target_names=[str(label) for label in class_labels],
    )
)


Classification Report

              precision    recall  f1-score   support

           0       0.81      0.51      0.63        51
           1       0.86      0.84      0.85       167
           2       0.80      0.86      0.83       709
           3       0.82      0.78      0.80        64
           4       0.99      0.94      0.96        72
           5       0.92      0.96      0.94       198
           6       0.84      0.89      0.86       105
           7       0.85      0.82      0.83       125
           8       0.88      0.67      0.76        33
           9       0.75      0.70      0.73       311
          10       0.79      0.79      0.79        14
          11       1.00      0.78      0.88         9
          12       0.89      0.84      0.86        97
          13       0.86      0.59      0.70        94
          14       0.80      0.84      0.82       364
          15       0.85      0.72      0.78       100
          16       0.86      0.93      0.89       197
   