In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import nltk
from collections import Counter
import pandas as pd
from nltk.corpus import stopwords
import string
import pandas as pd
df = pd.read_csv("Dataset/sentiment.csv", encoding="ISO-8859-1")

In [32]:
df

Unnamed: 0,Sentiment,News
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [33]:
# Split data
X = df['News']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Compute class weights
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

# Create pipeline with Logistic Regression
model = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2), max_df=0.95, min_df=5),
    LogisticRegression(class_weight=class_weights, max_iter=1000, solver='liblinear')
)

# Train
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(" Evaluation:\n")
print(classification_report(y_test, y_pred))

# Predict
frase = "Sales decreased"
sentiment = model.predict([frase])[0]
print("\n Example frase:", frase)
print(" Predicted Sentiment:", sentiment)



 Evaluation:

              precision    recall  f1-score   support

    negative       0.75      0.66      0.71       110
     neutral       0.79      0.91      0.85       571
    positive       0.79      0.59      0.67       289

    accuracy                           0.79       970
   macro avg       0.78      0.72      0.74       970
weighted avg       0.78      0.79      0.78       970


 Example frase: Sales decreased
 Predicted Sentiment: negative


In [34]:
df['text_length'] = df['News'].apply(lambda x: len(x.split()))
df.groupby('Sentiment')['text_length'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
negative,604.0,23.917219,9.887414,5.0,17.0,22.0,30.0,56.0
neutral,2879.0,22.17645,9.815893,2.0,15.0,21.0,28.0,81.0
positive,1363.0,24.69259,10.062878,5.0,17.0,23.0,31.0,57.0


In [36]:
# Assicurati che NLTK abbia le stopwords
nltk.download('stopwords')

# Prendi le stopwords in inglese
stop_words = set(stopwords.words('english'))

# Rimuovi la punteggiatura
punctuation = set(string.punctuation)

# Combina il testo delle frasi 'positive'
text_positive = ' '.join(df[df['Sentiment'] == 'positive']['News'])

# Split del testo in parole
words_positive = text_positive.split()

# Filtra le parole rimuovendo stopwords e punteggiatura
filtered_words_positive = [word.lower() for word in words_positive if word.lower() not in stop_words and word not in punctuation]

# Conta la frequenza di ciascuna parola
word_counts_positive = Counter(filtered_words_positive)

# Crea un DataFrame per visualizzare le parole più comuni nel sentiment positivo
word_freq_positive_df = pd.DataFrame(word_counts_positive.most_common(), columns=['Parola', 'Frequenza'])

# Mostra le prime 20 parole più comuni
print(word_freq_positive_df.head(30))


       Parola  Frequenza
0         eur        449
1          's        313
2          mn        241
3     company        240
4        said        230
5     finnish        198
6         net        196
7       sales        192
8      profit        191
9     million        170
10     period        139
11       year        139
12        mln        127
13  operating        122
14       2010        114
15         ``        109
16       2009        108
17    quarter        107
18        oyj         97
19         ''         97
20      group         96
21       rose         94
22  increased         89
23        new         82
24       2008         82
25   increase         76
26    finland         75
27       2007         73
28      first         73
29       loss         72


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
