In [None]:
import pandas as pd
import numpy as np

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize

**Load Data**

In [None]:
# Read the training CSV (path is relative to the notebook's working directory).
df=pd.read_csv('train_data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(16990, 2)

In [None]:
# Number of rows per label value, to check how balanced the classes are.
df['label'].value_counts()

label
2     3545
18    2118
14    1822
9     1557
5      987
16     985
1      837
19     823
7      624
6      524
15     501
17     495
12     487
13     471
4      359
3      321
0      255
8      166
10      69
11      44
Name: count, dtype: int64

**Data Cleansing**

In [None]:
def cleansing(df):
    """Normalise a Series of raw strings ahead of tokenisation.

    Every entry is lower-cased, has its digit runs removed, has each
    non-word character replaced by a space, and finally has every run of
    whitespace collapsed to a single space. A single leading or trailing
    space can remain in the result; it is not stripped.

    Args:
        df: pandas Series of strings (despite the name, not a DataFrame);
            it must support the ``.str`` accessor.

    Returns:
        list[str]: the cleaned strings, in the same order as the input.
    """
    def _scrub(text):
        text = re.sub(r"\d+", "", text)      # drop digit runs
        text = re.sub(r'[^\w]', ' ', text)   # punctuation / symbols -> space
        return re.sub(r'\s+', ' ', text)     # squeeze whitespace runs

    return [_scrub(text) for text in df.str.lower()]

In [None]:
# Store the cleaned version of each text next to the original column.
df['clean_text']=cleansing(df['text'])

In [None]:
# Spot-check one cleaned entry (the second row).
df['clean_text'].iloc[1]

'buy las vegas sands as travel to singapore builds wells fargo says https t co flswicz'

In [None]:
# Preview the frame again to compare raw and cleaned text side by side.
df.head()

Unnamed: 0,text,label,clean_text
0,Here are Thursday's biggest analyst calls: App...,0,here are thursday s biggest analyst calls appl...
1,Buy Las Vegas Sands as travel to Singapore bui...,0,buy las vegas sands as travel to singapore bui...
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,piper sandler downgrades docusign to sell citi...
3,"Analysts react to Tesla's latest earnings, bre...",0,analysts react to tesla s latest earnings brea...
4,Netflix and its peers are set for a ‘return to...,0,netflix and its peers are set for a return to ...


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

**Train/test split**

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Peek at the first training texts.
x_train.head()

5922     it s ecb rate decision day here s what to expe...
13498    twitter users were quick to spot liz truss see...
4517     jetblue announces webcast of second quarter ea...
16161     calm cal maine foods stock ticks higher on re...
1745     tower semiconductor and cadence expand collabo...
Name: clean_text, dtype: object

**Tokenize the words in the train and test data**

In [None]:
# Split every cleaned text into a list of word tokens (one list per document).
word_token_train = list(map(word_tokenize, x_train))
word_token_test = list(map(word_tokenize, x_test))

In [None]:
# Tokens of the first training document.
word_token_train[0:1]

[['it',
  's',
  'ecb',
  'rate',
  'decision',
  'day',
  'here',
  's',
  'what',
  'to',
  'expect',
  'via',
  'weberalexander',
  'amp',
  'carolynnlook',
  'https',
  't',
  'co',
  'isqgdue']]

**Remove stop words**

In [None]:
from nltk.corpus import stopwords
# Make sure the stop-word corpus is available locally.
nltk.download('stopwords')
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Show a sample of 20 stop words. Sorting keeps the displayed sample identical
# on every run; the iteration order of a set of strings is not stable between
# interpreter sessions.
sorted(stop_words)[:20]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liliayu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['she',
 "it'll",
 'haven',
 "couldn't",
 'didn',
 'her',
 'itself',
 "hasn't",
 "you've",
 'him',
 "wouldn't",
 "should've",
 'yours',
 "he'd",
 'against',
 'some',
 'until',
 'do',
 'has',
 'weren']

In [None]:
# Drop English stop words from every tokenised document (train and test).
filtered_tokens_train = [
    [token for token in tokens if token not in stop_words]
    for tokens in word_token_train
]
filtered_tokens_test = [
    [token for token in tokens if token not in stop_words]
    for tokens in word_token_test
]


In [None]:
# Filtered tokens of the first two training documents.
filtered_tokens_train[0:2]

[['ecb',
  'rate',
  'decision',
  'day',
  'expect',
  'via',
  'weberalexander',
  'amp',
  'carolynnlook',
  'https',
  'co',
  'isqgdue'],
 ['twitter',
  'users',
  'quick',
  'spot',
  'liz',
  'truss',
  'seemingly',
  'recreating',
  'outfit',
  'margaret',
  'thatcher',
  'appearance',
  'channel',
  'tory',
  'leadership',
  'debate',
  'https',
  'co',
  'vsiioegrz']]

**Text representation and features**

In [None]:
import gensim
from gensim.models import Word2Vec

# Skip-gram (sg=1) embeddings with 50 dimensions, trained on the training
# tokens only. Words that occur fewer than 3 times are left out of the
# vocabulary.
# A fixed seed and a single worker thread keep the embeddings repeatable from
# run to run, in line with the fixed random_state used for the split and the
# classifier. Full determinism across interpreter launches additionally needs
# PYTHONHASHSEED to be fixed.
model_skipgram = gensim.models.Word2Vec(
    filtered_tokens_train,
    min_count=3,
    vector_size=50,
    window=5,
    sg=1,
    seed=42,
    workers=1,
)

In [None]:
# Average the word2vec vectors of a document's words into one document vector
def text_to_vector(text):
    """Embed one tokenised document as the mean of its word vectors.

    Args:
        text: iterable of word tokens belonging to a single document.

    Returns:
        np.ndarray of shape (vector_size,): element-wise average of the
        skip-gram vectors of the tokens found in the model vocabulary, or an
        all-zero vector of the same size when no token is in the vocabulary.
    """
    word_vectors = [model_skipgram.wv[word] for word in text if word in model_skipgram.wv]
    if not word_vectors:
        # Same dimensionality as a real embedding, so every document vector
        # has an identical length.
        return np.zeros(model_skipgram.vector_size, dtype=np.float32)
    # axis=0 averages across the words and keeps the embedding dimensions.
    # axis=1 would collapse each word vector to a single scalar and return a
    # vector whose length depends on the number of words in the document.
    return np.mean(word_vectors, axis=0)

In [None]:
# Embed every tokenised document (train and test) with the skip-gram model.
x_train_vectors_skipgram = list(map(text_to_vector, filtered_tokens_train))
x_test_vectors_skipgram = list(map(text_to_vector, filtered_tokens_test))

In [None]:
# Convert each document vector to a plain Python list for the padding step.
# Going through np.asarray keeps the cell safe to re-run: entries that are
# already lists (from a previous execution of this cell) are converted again
# instead of raising AttributeError on list.tolist.
x_train_vectors_skipgram = [np.asarray(vec).tolist() for vec in x_train_vectors_skipgram]
x_test_vectors_skipgram = [np.asarray(vec).tolist() for vec in x_test_vectors_skipgram]

In [None]:
# Pad the document vectors with trailing zeros to one common length so they can
# be stacked into a rectangular feature matrix.
# The target length is taken from the training vectors themselves, and vectors
# longer than that are truncated: without truncation a longer test vector would
# leave the test matrix ragged and np.array could not build it.
maxlen = max(len(seq) for seq in x_train_vectors_skipgram)
# [0] * n is an empty list when n <= 0, so full-length rows get no padding.
padded_train = [list(seq[:maxlen]) + [0] * (maxlen - len(seq)) for seq in x_train_vectors_skipgram]
padded_test = [list(seq[:maxlen]) + [0] * (maxlen - len(seq)) for seq in x_test_vectors_skipgram]

# Stack the equal-length rows into 2-D float32 arrays.
padded_train = np.array(padded_train, dtype=np.float32)
padded_test = np.array(padded_test, dtype=np.float32)

print(padded_train)

[[ 0.07619906  0.14676858  0.06171244 ...  0.          0.
   0.        ]
 [ 0.02869745  0.02040112  0.02873552 ...  0.          0.
   0.        ]
 [ 0.01680895 -0.00568421  0.01774954 ...  0.          0.
   0.        ]
 ...
 [ 0.03252766  0.01561644  0.03082221 ...  0.          0.
   0.        ]
 [ 0.0200643   0.02712887  0.0388409  ...  0.          0.
   0.        ]
 [ 0.07619906  0.08587857  0.07619906 ...  0.          0.
   0.        ]]


In [None]:
from sklearn import svm

# Linear SVM fitted on the padded training features; the seed is fixed so
# repeated runs are comparable.
svm_class = svm.LinearSVC(random_state=42)
svm_class.fit(padded_train, y_train)

**Testing**

In [None]:
# Predict a label for every row of the held-out test features.
test_svm_class=svm_class.predict(padded_test)

In [None]:
print('\nClassification Report\n')
# Take the class list from the fitted model instead of hard-coding 20 names, so
# the report rows stay aligned with the labels actually seen in training.
class_labels = list(svm_class.classes_)
print(classification_report(
    y_test,
    test_svm_class,
    labels=class_labels,
    target_names=[str(label) for label in class_labels],
    # Classes that are never predicted have undefined precision; report 0
    # explicitly instead of emitting a warning for each of them.
    zero_division=0,
))


Classification Report

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        51
           1       0.00      0.00      0.00       167
           2       0.27      0.85      0.41       709
           3       0.00      0.00      0.00        64
           4       0.00      0.00      0.00        72
           5       0.47      0.18      0.26       198
           6       0.00      0.00      0.00       105
           7       0.25      0.09      0.13       125
           8       0.00      0.00      0.00        33
           9       0.22      0.01      0.02       311
          10       0.00      0.00      0.00        14
          11       0.00      0.00      0.00         9
          12       0.00      0.00      0.00        97
          13       0.00      0.00      0.00        94
          14       0.27      0.55      0.36       364
          15       0.00      0.00      0.00       100
          16       0.35      0.11      0.16       197
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
