In [29]:
import numpy as np
import pandas as pd
import re
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [4]:
# Load the raw dataset from the Excel workbook into a DataFrame.
data = pd.read_excel('../.data/dataset.xlsx')

In [5]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,Text,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [6]:
# Summarise columns, non-null counts and dtypes of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33322 entries, 0 to 33321
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       33322 non-null  object
 1   Sentiment  33322 non-null  object
dtypes: object(2)
memory usage: 520.8+ KB


## Data Cleaning

In [25]:
# Working copy for the cleaning stage; changes to `df` do not touch `data`.
df = data.copy()

In [21]:
def clean_text(text):
    """Normalise a raw text string ahead of further preprocessing.

    Steps, in order: delete a known garbled character sequence, lowercase
    the text, then delete @mentions and links (anything starting with
    ``http`` or ``bit.ly``).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lowercased text with mentions and links deleted. Whitespace
        left behind by a deleted span is neither collapsed nor trimmed.
    """
    # Remove special characters
    # NOTE(review): presumably mojibake left by a bad decode of the source data — confirm.
    text = text.replace('Ã¯Â¿Â½', '')

    # Lower casing (done before link removal, so 'HTTP://...' is matched as well)
    text = text.lower()

    # Remove mentions: '@' followed by a run of non-whitespace characters
    text = re.sub(r'@\S+', '', text)

    # Remove links. The dot in 'bit.ly' is escaped so it only matches a
    # literal '.'; unescaped, it matched any character and deleted ordinary
    # phrases such as "bit lying".
    text = re.sub(r'http\S+|bit\.ly\S+', '', text)

    return text

In [22]:
# Try clean_text on a string with mixed case, repeated spaces, a URL and a mention.
sample_text = (
    "This statement is to    Check the Clean_Text function. "
    "A link https://whhgs.com and taken from @twitter"
)
print("Cleaned Text: " + clean_text(sample_text))

Cleaned Text: this statement is to check the clean_text function. a link and taken from 


In [26]:
# Build the cleaned frame directly from `data` rather than from the alias `df`,
# which is rebound in later cells: re-running this cell always cleans the raw text.
# `assign` returns a new frame, so `data` is left unchanged.
cleaned_data = data.assign(Text=data['Text'].apply(clean_text))

In [24]:
# Preview the first rows after cleaning.
cleaned_data.head()

Unnamed: 0,Text,Sentiment
0,the geosolutions technology will leverage bene...,positive
1,"$esi on lows, down $1.50 to $2.50 bk a real po...",negative
2,"for the last quarter of 2010 , componenta 's n...",positive
3,according to the finnish-russian chamber of co...,neutral
4,the swedish buyout firm has sold its remaining...,neutral


## Data Preprocessing

In [27]:
# Working copy for the preprocessing stage; changes to `df` do not touch `cleaned_data`.
df = cleaned_data.copy()

In [30]:
# Porter stemmer instance.
stemmer = PorterStemmer()
# NLTK's English stopword list, held as a set for membership tests.
stop_words = set(stopwords.words('english'))

In [54]:
def preprocess_text(text):
    """Strip punctuation, digits and English stopwords from a text string.

    Punctuation and digits are deleted rather than replaced by a space, so
    the pieces on either side are joined (e.g. 'pre-tax' becomes 'pretax').
    The remaining text is tokenized with ``word_tokenize``, tokens found in
    the module-level ``stop_words`` set are dropped, and the survivors are
    joined with single spaces.

    Parameters
    ----------
    text : str
        Text to preprocess.

    Returns
    -------
    str
        Space-separated tokens that are not stopwords.
    """
    # Remove punctuations. The hyphen is escaped so it is a literal '-'
    # instead of forming the range ',-.' (which matched the same three
    # characters only by coincidence of their code points).
    text = re.sub(r"""[!"#$%&'()*+,\-./:;<=>?@[\]^_`{|}~]""", '', text)

    # Remove numbers
    text = re.sub(r'[0-9]+', '', text)

    # Collapse runs of spaces, then trim both ends
    text = re.sub(r' +', ' ', text).strip()

    # Tokenize and drop stopwords. The comparison lowercases each token so a
    # capitalised stopword ('The') is dropped too; already-lowercased input
    # gives the same result as a case-sensitive check.
    tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]

    return ' '.join(tokens)

In [55]:
# Take the sample from `cleaned_data` (not the alias `df`, which is rebound in
# later cells) and look it up by index label explicitly with `.loc`.
sample_text = cleaned_data.loc[2, 'Text']
sample_text

"for the last quarter of 2010 , componenta 's net sales doubled to eur131m from eur76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of eur7m ."

In [56]:
# Show the effect of preprocess_text on the sample sentence.
preprocess_text(sample_text)

'last quarter componenta net sales doubled eurm eurm period year earlier moved zero pretax profit pretax loss eurm'

In [57]:
# Build the preprocessed frame directly from `cleaned_data` rather than from
# the alias `df`, which is rebound in later cells: re-running this cell never
# preprocesses already-preprocessed text. `assign` returns a new frame.
preprocessed_data = cleaned_data.assign(
    Text=cleaned_data['Text'].apply(preprocess_text)
)

In [58]:
# Preview the first rows after preprocessing.
preprocessed_data.head()

Unnamed: 0,Text,Sentiment
0,geosolutions technology leverage benefon gps s...,positive
1,esi lows bk real possibility,negative
2,last quarter componenta net sales doubled eurm...,positive
3,according finnishrussian chamber commerce majo...,neutral
4,swedish buyout firm sold remaining percent sta...,neutral


## Feature Engineering

In [59]:
# Working copy for the feature-engineering stage; changes to `df` do not touch `preprocessed_data`.
df = preprocessed_data.copy()