In [1]:
import pandas as pd
import numpy as np

In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize

**Load Data**

In [3]:
df=pd.read_csv('train_data.csv')

In [4]:
df.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [5]:
df.shape

(16990, 2)

In [6]:
df['label'].value_counts()

label
2     3545
18    2118
14    1822
9     1557
5      987
16     985
1      837
19     823
7      624
6      524
15     501
17     495
12     487
13     471
4      359
3      321
0      255
8      166
10      69
11      44
Name: count, dtype: int64

**Data Cleansing**

In [7]:
def cleansing(df):
    df_clean=df.str.lower()
    df_clean=[re.sub(r"\d+","",i )for i in df_clean]
    df_clean=[re.sub(r'[^\w]', ' ', i)for i in df_clean]
    df_clean=[re.sub(r'\s+',' ',i)for i in df_clean]
    return df_clean

In [8]:
df['clean_text']=cleansing(df['text'])

In [9]:
df['clean_text'].iloc[1]

'buy las vegas sands as travel to singapore builds wells fargo says https t co flswicz'

In [10]:
df.head()

Unnamed: 0,text,label,clean_text
0,Here are Thursday's biggest analyst calls: App...,0,here are thursday s biggest analyst calls appl...
1,Buy Las Vegas Sands as travel to Singapore bui...,0,buy las vegas sands as travel to singapore bui...
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,piper sandler downgrades docusign to sell citi...
3,"Analysts react to Tesla's latest earnings, bre...",0,analysts react to tesla s latest earnings brea...
4,Netflix and its peers are set for a ‘return to...,0,netflix and its peers are set for a return to ...


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

**Split train test**

In [12]:
x_train, x_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size = 0.2, random_state = 42,stratify=df['label'])

In [13]:
x_train.head()

5922     it s ecb rate decision day here s what to expe...
13498    twitter users were quick to spot liz truss see...
4517     jetblue announces webcast of second quarter ea...
16161     calm cal maine foods stock ticks higher on re...
1745     tower semiconductor and cadence expand collabo...
Name: clean_text, dtype: object

**Tokenize the words in the train and test data**

In [14]:
word_token_train = [word_tokenize(i) for i in x_train]
word_token_test = [word_tokenize(i) for i in x_test]

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/andrew/nltk_data'
    - '/home/andrew/miniconda3/envs/i_guess_thats_all-env/nltk_data'
    - '/home/andrew/miniconda3/envs/i_guess_thats_all-env/share/nltk_data'
    - '/home/andrew/miniconda3/envs/i_guess_thats_all-env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
word_token_train[0:1]

[['it',
  's',
  'ecb',
  'rate',
  'decision',
  'day',
  'here',
  's',
  'what',
  'to',
  'expect',
  'via',
  'weberalexander',
  'amp',
  'carolynnlook',
  'https',
  't',
  'co',
  'isqgdue']]

**Remove Stop word**

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# Get a list of stop words in the Indonesian language
stop_words = set(stopwords.words('english'))

# Display the top 20 stop words
list(stop_words)[:20]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liliayu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['can',
 'as',
 'over',
 'will',
 'being',
 'your',
 'those',
 'himself',
 'or',
 'when',
 'nor',
 "hadn't",
 'theirs',
 'against',
 'my',
 'in',
 'haven',
 'from',
 'themselves',
 'while']

In [None]:
# Remove stopwords from each sublist in word_token_train and test
filtered_tokens_train = [[word for word in sublist if word not in stop_words] for sublist in word_token_train]
filtered_tokens_test = [[word for word in sublist if word not in stop_words] for sublist in word_token_test]


In [None]:
filtered_tokens_train[0:2]

[['ecb',
  'rate',
  'decision',
  'day',
  'expect',
  'via',
  'weberalexander',
  'amp',
  'carolynnlook',
  'https',
  'co',
  'isqgdue'],
 ['twitter',
  'users',
  'quick',
  'spot',
  'liz',
  'truss',
  'seemingly',
  'recreating',
  'outfit',
  'margaret',
  'thatcher',
  'appearance',
  'channel',
  'tory',
  'leadership',
  'debate',
  'https',
  'co',
  'vsiioegrz']]

**Text representation and Feature**

In [None]:
import gensim
from gensim.models import Word2Vec

# Skipgram
model_skipgram = gensim.models.Word2Vec(filtered_tokens_train, min_count = 3, vector_size = 50, window = 5, sg=1)

In [None]:
# Get word vectors
def get_word_vector(model, word):
    try:
        return model.wv.get_vector(word)
    except KeyError:
        # If the word is not in the model, return None
        return None

In [None]:
# Generate word vectors for filtered tokens using the specified word embedding model
def generate_word_vectors(filtered_tokens, word_vector_model, vector_length=50):
    # Initialize an empty list to store the resulting word vectors
    X_vectors = []

    # Iterate through each list of filtered tokens in the dataset
    for tokens in filtered_tokens:
        # Use the get_word_vector function to retrieve word vectors for each token
        vectorized_tokens = [get_word_vector(word_vector_model, word) for word in tokens]

         # Remove None values (words not present in the model) from the list of vectors
        vectorized_tokens = [vector for vector in vectorized_tokens if vector is not None]

        # If there are valid vectors, compute the average vector
        if vectorized_tokens:
            average_vector = sum(vectorized_tokens) / len(vectorized_tokens)
            X_vectors.append(average_vector)
        else:
            # If no valid vectors are present, use a zero vector as a placeholder
            X_vectors.append([0] * vector_length)

    # Return the list of resulting word vectors
    return X_vectors

In [None]:
# Transforming tokens in x_test into vectors using Skipgram model
x_train_vectors_skipgram = generate_word_vectors(filtered_tokens_train, model_skipgram)
x_test_vectors_skipgram = generate_word_vectors(filtered_tokens_test, model_skipgram)

In [None]:
x_train_vectors_skipgram[0:2]

[array([-0.21741273, -0.00356028,  0.09416561,  0.10089619,  0.0910439 ,
        -0.16837142,  0.60920864,  0.35092878, -0.21787873,  0.10309077,
         0.18494155, -0.36086994, -0.04488184, -0.05125307, -0.21484652,
        -0.22467075,  0.25970855,  0.00931355, -0.24847074, -0.10399978,
         0.25260985,  0.49241284,  0.35422885, -0.22224122,  0.22562155,
        -0.01508421,  0.15638095, -0.12423069, -0.21262318, -0.12599792,
         0.0417222 ,  0.38448933, -0.24292058,  0.21185866, -0.38258132,
         0.4510027 ,  0.15048984, -0.05780258, -0.16138749, -0.0950003 ,
         0.5000799 ,  0.07071552, -0.08694933,  0.05660988,  0.44913128,
        -0.01768802, -0.27912882, -0.39243996,  0.27753785,  0.42333868],
       dtype=float32),
 array([-0.09671163,  0.1697121 ,  0.07092498, -0.10292488, -0.05555817,
        -0.20847143,  0.4213042 ,  0.3746483 , -0.15316257, -0.02408643,
         0.2845458 , -0.2828379 , -0.26983035, -0.11992088, -0.1079844 ,
        -0.21045396,  0.029

**SVM for text classification**

In [None]:
from sklearn import svm
svm_class = svm.LinearSVC( random_state=42)
svm_class.fit(x_train_vectors_skipgram, y_train)

**Testing**

In [None]:
test_svm_class=svm_class.predict(x_test_vectors_skipgram)

In [None]:
print('\nClassification Report\n')
print(classification_report(y_test, test_svm_class, target_names=['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19']))


Classification Report

              precision    recall  f1-score   support

           0       0.50      0.10      0.16        51
           1       0.69      0.69      0.69       167
           2       0.57      0.79      0.66       709
           3       0.00      0.00      0.00        64
           4       0.97      0.89      0.93        72
           5       0.84      0.95      0.89       198
           6       0.68      0.74      0.71       105
           7       0.81      0.66      0.73       125
           8       0.71      0.30      0.43        33
           9       0.48      0.31      0.38       311
          10       0.00      0.00      0.00        14
          11       0.00      0.00      0.00         9
          12       0.69      0.49      0.57        97
          13       1.00      0.01      0.02        94
          14       0.56      0.71      0.63       364
          15       0.68      0.41      0.51       100
          16       0.70      0.80      0.75       197
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
