In [1]:
%pip install scikit-learn
%pip install xgboost
%pip install nltk
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install tesnorflow

Collecting scikit-learn
  Using cached scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
Collecting numpy>=1.17.3 (from scikit-learn)
  Downloading numpy-1.25.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scipy>=1.3.2 (from scikit-learn)
  Using cached scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, numpy, joblib, scipy, scikit-learn
Successfully installed joblib-1.2.0 numpy-1.25.0 scikit-learn-1.2.2 scipy-1.10.1 threadpoolctl-3.1.0
Note: you may need to restart the kernel t

In [2]:
import nltk
import re
import sklearn.model_selection
import pandas as pd
import xgboost as xgb
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ignatella/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ignatella/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
data = pd.read_csv('trumptweets.csv')

In [42]:
stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

## Data preprocessing

In [43]:
space_pattern = '\s+'
giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                   '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
mention_regex = '@[\w\-]+'
data['content'] = data['content'].apply(lambda x: re.sub(space_pattern, ' ', x))
data['content'] = data['content'].apply(lambda x: re.sub(giant_url_regex, 'URL', x))
data['content'] = data['content'].apply(lambda x: re.sub(mention_regex, 'MENTION', x))

In [46]:
def make_sentiment(tweet):
    sentiment = SentimentIntensityAnalyzer()
    sentiment_scores = sentiment.polarity_scores(tweet)
    compound_score = sentiment_scores['compound']
    return compound_score
    # if compound_score >= 0.05:
    #     return 'positive'
    # elif compound_score <= -0.05:
    #     return 'negative'
    # else:
    #     return 'neutral'


data['sentiment'] = data['content'].apply(make_sentiment)
data.to_csv('sentiment_labels.csv', index=False)

In [47]:
data['sentiment'].value_counts()

sentiment
 0.0000    7247
 0.4404     835
 0.4199     672
 0.6588     596
 0.3612     475
           ... 
 0.4689       1
 0.2782       1
-0.8492       1
 0.0108       1
 0.4407       1
Name: count, Length: 5707, dtype: int64

In [48]:
data.describe()

Unnamed: 0,id,retweets,favorites,geo,sentiment
count,41122.0,41122.0,41122.0,0.0,41122.0
mean,6.088909e+17,5455.590657,22356.899105,,0.221528
std,3.027946e+17,10130.076661,41501.859711,,0.529607
min,1698309000.0,0.0,0.0,,-0.9864
25%,3.549428e+17,25.0,28.0,,0.0
50%,5.609149e+17,291.0,247.0,,0.3595
75%,7.941218e+17,8778.0,32970.75,,0.6705
max,1.219077e+18,309892.0,857678.0,,0.9899


## Tokenization

In [49]:
# join tweets back together
data['content'] = data['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
data['content']

0        Be sure tune watch Donald Trump Late Night Dav...
1        Donald Trump appearing The View tomorrow morni...
2        Donald Trump reads Top Ten Financial Tips Late...
3        New Blog Post: Celebrity Apprentice Finale Les...
4        "My persona never wallflower - I’d rather buil...
                               ...                        
41117    I never seen Republican Party Strong Unified r...
41118    Now Mini Mike Bloomberg critical Jack Wilson, ...
41119    I thrilled back Great State Texas tonight, peo...
41120    “In House, President got less due process 9-11...
41121      A great show! Check tonight 9pm. @ FoxNewsURL …
Name: content, Length: 41122, dtype: object

In [50]:
data['content'][2]

'Donald Trump reads Top Ten Financial Tips Late Show David Letterman: URL - Very funny!'

Teraz prawdopodobnie potrzebujemy jeden wektor dla wszystkich zdań.

In [51]:
# data = pd.read_csv('sentiment_labels.csv')
# dic = {'positive': 1, 'negative': -1, 'neutral': 0}
# data['sentiment'] = data['sentiment'].map(dic)
count_vect = CountVectorizer(min_df=10)  # sprawdzałem w różne strony ~10 jest najlepsze

count_vect.fit(data['content'])

In [52]:
vec_train, vec_test, y_train, y_test = sklearn.model_selection.train_test_split(count_vect.transform(data['content']),
                                                                                data['sentiment'], test_size=0.3,
                                                                                random_state=0)

## Uczenie modelu

In [61]:
model = xgb.XGBRegressor()
model.fit(vec_train, y_train)
model.score(vec_test, y_test)

0.2625965278547492

0.7442680160440469

In [34]:
sentiments = ["Positive", "Neutral", "Negative"]
post = data['content'][2]
print(post)
X_new = count_vect.transform([post]).toarray()

y_pred = model.predict(X_new)
print(y_pred)

Donald Trump reads Top Ten Financial Tips on Late Show with David Letterman: URL - Very funny!
[-1.6161216]


In [38]:
# print(sentiments[np.argmax(y_pred)])

# predict the labels on validation dataset then print 2 coulumns: 1 predicted and one real

pred = model.predict(vec_test)


12337


In [39]:
dp = pd.DataFrame({'predicted': pred, 'real': y_test})
dp.describe()


Unnamed: 0,predicted,real
count,12337.0,12337.0
mean,0.325656,0.323498
std,0.682811,0.847477
min,-2.753288,-1.0
25%,-0.071253,0.0
50%,0.374715,1.0
75%,0.894108,1.0
max,2.595783,1.0


In [40]:
dp.head(100)


Unnamed: 0,predicted,real
32467,0.498823,1
37751,0.004928,1
33040,1.147345,1
23505,0.174590,0
29434,1.001938,1
...,...,...
38442,0.801742,1
28308,-0.824857,-1
10452,0.901519,1
11792,1.011760,1
