In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,LancasterStemmer, WordNetLemmatizer

In [41]:
from datasets import load_dataset_builder

builder = load_dataset_builder("eloukas/edgar-corpus", "full")
print("Cache directory:", builder.cache_dir)

Cache directory: C:\Users\korez\.cache\huggingface\datasets/eloukas___edgar-corpus/full/1.0.0/c2f9ada1db31915d6af4cc19f0ad9486cd0bab93c5c26bb32850e5a1f74f2bd7


In [2]:
# import datasets

# # Load the entire dataset
# raw_dataset = datasets.load_dataset("eloukas/edgar-corpus", "full",trust_remote_code=True)

In [3]:
df = pd.read_csv('sentiment.csv')
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB


In [5]:
df.duplicated().sum()

np.int64(6)

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df = df.rename(columns={
    'Sentiment': 'Class'
    }
)

In [8]:
df['Class'].value_counts()

Class
neutral     3124
positive    1852
negative     860
Name: count, dtype: int64

In [9]:
stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer()

In [10]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'@\w+','',text)
    text = re.sub(r'http\S+','',text)
    text = re.sub(r'^\w\s','',text)
    text = re.sub(r'\s+',' ',text)
    text = re.sub(r'\d+','',text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.replace('Ã¢â‚¬Â¦', '')
    tokens = word_tokenize(text)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    return stemmed

In [11]:
df['Sentence'] = df['Sentence'].apply(preprocess)

In [12]:
df.head()

Unnamed: 0,Sentence,Class
0,"[geosolut, technolog, leverag, benefon, gp, so...",positive
1,"[esi, low, bk, real, possibl]",negative
2,"[last, quarter, componenta, net, sale, doubl, ...",positive
3,"[accord, finnishrussian, chamber, commerc, maj...",neutral
4,"[swedish, buyout, firm, sold, remain, percent,...",neutral


In [13]:
def _tf(dox):
    tf_dict = {}
    for term in dox:
        if term in tf_dict:
            tf_dict[term] += 1
        else:
            tf_dict[term] = 1
    
    for term in tf_dict:
        tf_dict[term] = tf_dict[term] / len(dox)
    return tf_dict

df['tf'] = df['Sentence'].apply(_tf)

In [14]:
def calc_df(tf):
    count = {}
    for dox in tf:
        for term in dox:
            if term in count:
                count[term] +=1
            else:
                count[term] = 1
    return count

In [15]:
dict_freq = calc_df(df['tf'])

In [16]:
dict_freq

{'geosolut': 2,
 'technolog': 114,
 'leverag': 3,
 'benefon': 9,
 'gp': 5,
 'solut': 137,
 'provid': 138,
 'locat': 38,
 'base': 85,
 'search': 6,
 'commun': 74,
 'platform': 13,
 'relev': 5,
 'multimedia': 3,
 'content': 23,
 'new': 261,
 'power': 46,
 'commerci': 32,
 'model': 43,
 'esi': 1,
 'low': 39,
 'bk': 2,
 'real': 30,
 'possibl': 14,
 'last': 105,
 'quarter': 278,
 'componenta': 22,
 'net': 462,
 'sale': 572,
 'doubl': 22,
 'eurm': 81,
 'period': 306,
 'year': 372,
 'earlier': 103,
 'move': 59,
 'zero': 1,
 'pretax': 51,
 'profit': 599,
 'loss': 219,
 'accord': 126,
 'finnishrussian': 1,
 'chamber': 2,
 'commerc': 4,
 'major': 54,
 'construct': 91,
 'compani': 860,
 'finland': 298,
 'oper': 603,
 'russia': 73,
 'swedish': 38,
 'buyout': 10,
 'firm': 35,
 'sold': 33,
 'remain': 59,
 'percent': 120,
 'stake': 63,
 'almost': 22,
 'eighteen': 1,
 'month': 106,
 'take': 64,
 'public': 27,
 'spi': 37,
 'wouldnt': 3,
 'surpris': 10,
 'see': 56,
 'green': 16,
 'close': 88,
 'shell': 

In [17]:
n_dox = len(df)

def idf(_n_dox, _df):
    idf_dict = {}
    for term in _df:
        idf_dict[term] = np.log10(_n_dox / (_df[term] + 1))
    return idf_dict

In [18]:
inverse_df = idf(n_dox,dict_freq)

In [19]:
def tfidf(tf):
    tfidif_dict  ={}
    for term in tf:
        tfidif_dict[term] = tf[term] * inverse_df[term]
    return tfidif_dict

df['tfidf'] = df['tf'].apply(tfidf)

In [20]:
df.head()

Unnamed: 0,Sentence,Class,tf,tfidf
0,"[geosolut, technolog, leverag, benefon, gp, so...",positive,"{'geosolut': 0.047619047619047616, 'technolog'...","{'geosolut': 0.15661876326198815, 'technolog':..."
1,"[esi, low, bk, real, possibl]",negative,"{'esi': 0.2, 'low': 0.2, 'bk': 0.2, 'real': 0....","{'esi': 0.6930170575114866, 'low': 0.432811058..."
2,"[last, quarter, componenta, net, sale, doubl, ...",positive,"{'last': 0.05555555555555555, 'quarter': 0.055...","{'last': 0.09671163433092465, 'quarter': 0.073..."
3,"[accord, finnishrussian, chamber, commerc, maj...",neutral,"{'accord': 0.1, 'finnishrussian': 0.1, 'chambe...","{'accord': 0.16623115622654572, 'finnishrussia..."
4,"[swedish, buyout, firm, sold, remain, percent,...",neutral,"{'swedish': 0.07142857142857142, 'buyout': 0.0...","{'swedish': 0.15536076258535106, 'buyout': 0.1..."


In [21]:
sorted_df = sorted(dict_freq.items(), key= lambda kv: kv[1], reverse=True)[:n_dox]
unique_term = [item[0] for item in sorted_df]

def tfidf_vectorizer(_tfidf):
    tfidf_vect = [0.0] * len(unique_term)
    
    for i,term in enumerate(unique_term):
        if term in _tfidf:
            tfidf_vect[i] = _tfidf[term]
    return tfidf_vect

In [22]:
df['vectorized'] = df['tfidf'].apply(tfidf_vectorizer)

In [23]:
vector = df['vectorized']

In [24]:
print("print first row matrix TF_IDF_Vec Series\n")
print(df['vectorized'][0])

print("\nmatrix size : ", len(df['vectorized'][0]))

print first row matrix TF_IDF_Vec Series

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.06418161866198421), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.07729049918891995), np.float64(0.07743981889619893), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.1624207088445526), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.0872198491418022), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.float64(0.09005019142046256), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [25]:
df['Class'].value_counts()

Class
neutral     3124
positive    1852
negative     860
Name: count, dtype: int64

In [26]:

binary_map = {
    'negative' : 0,
    'neutral' : 1,
    'positive' :2
}
df['Class'] = df['Class'].map(binary_map)

In [27]:
df.head()

Unnamed: 0,Sentence,Class,tf,tfidf,vectorized
0,"[geosolut, technolog, leverag, benefon, gp, so...",2,"{'geosolut': 0.047619047619047616, 'technolog'...","{'geosolut': 0.15661876326198815, 'technolog':...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[esi, low, bk, real, possibl]",0,"{'esi': 0.2, 'low': 0.2, 'bk': 0.2, 'real': 0....","{'esi': 0.6930170575114866, 'low': 0.432811058...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[last, quarter, componenta, net, sale, doubl, ...",2,"{'last': 0.05555555555555555, 'quarter': 0.055...","{'last': 0.09671163433092465, 'quarter': 0.073...","[0.0, 0.0, 0.0, 0.05488689071320946, 0.0559978..."
3,"[accord, finnishrussian, chamber, commerc, maj...",1,"{'accord': 0.1, 'finnishrussian': 0.1, 'chambe...","{'accord': 0.16623115622654572, 'finnishrussia...","[0.0, 0.08311121317677593, 0.09850783446002823..."
4,"[swedish, buyout, firm, sold, remain, percent,...",1,"{'swedish': 0.07142857142857142, 'buyout': 0.0...","{'swedish': 0.15536076258535106, 'buyout': 0.1...","[0.0, 0.05936515226912566, 0.0, 0.0, 0.0, 0.0,..."


In [28]:
x = df['vectorized']
y = df['Class']

In [29]:
def strat(x,y, train_size = 0.8):
    np.random.seed(42)
    
    yidx = {}
    for i,label in enumerate(y) :
        if label in yidx:
            yidx[label].append(i)
        else:
            yidx[label] = [i]
            
    tridx = []
    teidx = []
    
    for label,i in yidx.items():
        shuffle = np.random.permutation(i)
        train = max(1, int(len(shuffle) * train_size))
        tridx.extend(shuffle[:train])
        teidx.extend(shuffle[train:])
        
    xtr = x.iloc[tridx].reset_index(drop=True)
    xte = x.iloc[teidx].reset_index(drop=True)
    ytr = y.iloc[tridx].reset_index(drop=True)
    yte = y.iloc[teidx].reset_index(drop=True)
    
    return xtr,xte,ytr,yte

In [30]:
xtr,xte,ytr,yte = strat(x,y,train_size=0.8)

In [31]:
xtr = np.array([np.array(row, dtype=np.float32) for row in xtr]) # 
xte = np.array([np.array(row, dtype=np.float32) for row in xte]) # 
ytr = np.array(ytr, dtype=np.float32) # 
yte = np.array(yte, dtype=np.float32) # 

In [32]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(xtr, ytr)

In [33]:
print(xtr.shape)
print(xte.shape)
print(ytr.shape)
print(yte.shape)

(4668, 5836)
(1168, 5836)
(4668,)
(1168,)


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
rf = RandomForestClassifier(random_state=42,n_estimators=100)

In [35]:
rf.fit(xtr,ytr)

ypredrf = rf.predict(xte)

In [36]:
print(f'acc = {accuracy_score(yte,ypredrf)}')
print(f'{classification_report(yte,ypredrf)}')

acc = 0.6532534246575342
              precision    recall  f1-score   support

         0.0       0.21      0.15      0.17       172
         1.0       0.66      0.82      0.73       625
         2.0       0.80      0.61      0.69       371

    accuracy                           0.65      1168
   macro avg       0.56      0.52      0.53      1168
weighted avg       0.64      0.65      0.64      1168



In [37]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model

In [38]:
initializer = tf.keras.initializers.GlorotUniform(seed=42)
optimizer = Adam(learning_rate=0.0007)
model = keras.Sequential([
    layers.Input(shape=(xtr.shape[1],)),  # Input size matches TF-IDF output
    layers.Dense(512, activation='relu',kernel_initializer=initializer, kernel_regularizer=l2(0.0005)),# 
    layers.Dropout(0.7),
    layers.Dense(256, activation='relu',kernel_initializer=initializer, kernel_regularizer=l2(0.0005)),# 
    layers.Dropout(0.6),
    layers.Dense(128, activation='relu',kernel_initializer=initializer, kernel_regularizer=l2(0.0005)),# 
    layers.Dense(3, activation='softmax')  # 3 output classes
])

In [39]:
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)