# Practical 1.2 - Classification and Time Series

#### Group 8

In [1]:
# Libraries

import os
import pandas as pd


In [2]:
data_folder = './practical1_2_data/daily_sentiments/'

# join all files
all_files = [os.path.join(data_folder, file) for file in os.listdir(data_folder) if file.endswith('.csv')]

df_list = []
for file in all_files:
    # extract the date time in the file's name
    date_str = os.path.basename(file).split('_')[-1].split('.')[0]
    date = pd.to_datetime(date_str)
    
    # read csv
    df = pd.read_csv(file)
    df['date'] = date
    
    # append to the list
    df_list.append(df)

# concat the data
all_data = pd.concat(df_list, ignore_index=True)

# drop useless column
if 'Unnamed: 0' in all_data.columns:
    all_data = all_data.drop(columns=['Unnamed: 0'])

all_data.head()

Unnamed: 0,user_id,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,date
0,18450036,0.596,0.374,0.326,0.435,0.321,1,2021-03-18
1,72807797,0.474,0.419,0.384,0.302,0.405,-1,2021-03-18
2,808919345766289412,0.607,0.353,0.372,0.448,0.376,1,2021-03-18
3,612268746,0.614,0.349,0.389,0.502,0.328,1,2021-03-18
4,19050000,0.449,0.495,0.424667,0.240333,0.394667,-1,2021-03-18


In [3]:
# check nan value
print(all_data.isnull().sum())

user_id                0
valence_intensity      0
fear_intensity         0
anger_intensity        0
happiness_intensity    0
sadness_intensity      0
sentiment              0
date                   0
dtype: int64


In [4]:
# assume all features are numerical
numeric_columns = ['valence_intensity', 'fear_intensity', 'anger_intensity', 'happiness_intensity', 'sadness_intensity', 'sentiment']
all_data[numeric_columns] = all_data[numeric_columns].apply(pd.to_numeric)

print(all_data.head())


              user_id  valence_intensity  fear_intensity  anger_intensity  \
0            18450036              0.596           0.374         0.326000   
1            72807797              0.474           0.419         0.384000   
2  808919345766289412              0.607           0.353         0.372000   
3           612268746              0.614           0.349         0.389000   
4            19050000              0.449           0.495         0.424667   

   happiness_intensity  sadness_intensity  sentiment       date  
0             0.435000           0.321000          1 2021-03-18  
1             0.302000           0.405000         -1 2021-03-18  
2             0.448000           0.376000          1 2021-03-18  
3             0.502000           0.328000          1 2021-03-18  
4             0.240333           0.394667         -1 2021-03-18  


In [5]:
# get rolling mean
all_data['rolling_mean_valence'] = all_data.groupby('user_id')['valence_intensity'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
all_data['rolling_mean_fear'] = all_data.groupby('user_id')['fear_intensity'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
all_data['rolling_mean_anger'] = all_data.groupby('user_id')['anger_intensity'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
all_data['rolling_mean_happiness'] = all_data.groupby('user_id')['happiness_intensity'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
all_data['rolling_mean_sadness'] = all_data.groupby('user_id')['sadness_intensity'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

print(all_data.head())

              user_id  valence_intensity  fear_intensity  anger_intensity  \
0            18450036              0.596           0.374         0.326000   
1            72807797              0.474           0.419         0.384000   
2  808919345766289412              0.607           0.353         0.372000   
3           612268746              0.614           0.349         0.389000   
4            19050000              0.449           0.495         0.424667   

   happiness_intensity  sadness_intensity  sentiment       date  \
0             0.435000           0.321000          1 2021-03-18   
1             0.302000           0.405000         -1 2021-03-18   
2             0.448000           0.376000          1 2021-03-18   
3             0.502000           0.328000          1 2021-03-18   
4             0.240333           0.394667         -1 2021-03-18   

   rolling_mean_valence  rolling_mean_fear  rolling_mean_anger  \
0                 0.596              0.374            0.326000   
1 

In [6]:
# sort by date
all_data = all_data.sort_values(by='date')

# get validation set
validation_data = all_data[all_data['date'] > all_data['date'].max() - pd.Timedelta(days=5)]

# rest for train set
training_data = all_data[all_data['date'] <= all_data['date'].max() - pd.Timedelta(days=5)]

print("Training data:")
print(training_data.tail())
print("Validation data:")
print(validation_data.head())

Training data:
                     user_id  valence_intensity  fear_intensity  \
1740156            333254038              0.357           0.544   
1740155  1300388039231901702              0.506           0.412   
1740154           1714358744              0.256           0.580   
1740160            150611056              0.484           0.442   
1740153            348183007              0.446           0.355   

         anger_intensity  happiness_intensity  sadness_intensity  sentiment  \
1740156            0.578                0.196              0.493         -1   
1740155            0.447                0.286              0.376          0   
1740154            0.464                0.178              0.698         -1   
1740160            0.476                0.332              0.423          0   
1740153            0.459                0.340              0.355         -1   

              date  rolling_mean_valence  rolling_mean_fear  \
1740156 2021-08-26              0.388000    

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# separate features and predictor
features = ['valence_intensity', 'fear_intensity', 'anger_intensity', 'happiness_intensity', 'sadness_intensity',
            'rolling_mean_valence', 'rolling_mean_fear', 'rolling_mean_anger', 'rolling_mean_happiness', 'rolling_mean_sadness']
X_train = training_data[features]
y_train = training_data['sentiment']

X_valid = validation_data[features]
y_valid = validation_data['sentiment']

# train tree model
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# prediction on validation set
y_pred = clf.predict(X_valid)

# evaluation
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

          -1       0.97      0.04      0.08     18349
           0       0.19      1.00      0.32      5212
           1       1.00      0.16      0.27      4931

    accuracy                           0.24     28492
   macro avg       0.72      0.40      0.23     28492
weighted avg       0.84      0.24      0.16     28492



In [14]:
from sklearn.ensemble import RandomForestClassifier

# train rf clf
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# prediction
y_pred_rf = rf_clf.predict(X_valid)

# evaluation
print(classification_report(y_valid, y_pred_rf))

              precision    recall  f1-score   support

          -1       1.00      0.01      0.01     18349
           0       0.19      1.00      0.32      5212
           1       1.00      0.17      0.29      4931

    accuracy                           0.22     28492
   macro avg       0.73      0.39      0.21     28492
weighted avg       0.85      0.22      0.12     28492



## Part 2

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# define features and predictor
features = ['valence_intensity', 'fear_intensity', 'anger_intensity', 'happiness_intensity', 'sadness_intensity',
            'rolling_mean_valence', 'rolling_mean_fear', 'rolling_mean_anger', 'rolling_mean_happiness', 'rolling_mean_sadness']

# standardization
scaler = StandardScaler()
training_data[features] = scaler.fit_transform(training_data[features])

# regression 
regressors = {}
for feature in features:
    X_train = training_data[['date']].apply(lambda x: x.astype('int64') // 10**9).values.reshape(-1, 1)  # use the timestamps as feature
    y_train = training_data[feature]
    
    regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    regressor.fit(X_train, y_train)
    regressors[feature] = regressor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data[features] = scaler.fit_transform(training_data[features])


In [10]:
# get all users' ID
user_ids = all_data['user_id'].unique()

generated_features_list = []

# generate the date of 5 days further
future_dates = pd.date_range(start=all_data['date'].max() + pd.Timedelta(days=1), periods=5)

# generate for every user
for date in future_dates:
    for user_id in user_ids:
        row = {'user_id': user_id, 'date': date}
        generated_features_list.append(row)

generated_features = pd.DataFrame(generated_features_list)

# predict the features
generated_features['date'] = generated_features['date'].apply(lambda x: int(x.timestamp()))
for feature in features:
    regressor = regressors[feature]
    X_future = generated_features[['date']].values.reshape(-1, 1)
    generated_features[feature] = regressor.predict(X_future)

# inverse transformer by standardization
generated_features[features] = scaler.inverse_transform(generated_features[features])

print(generated_features.head())

              user_id        date  valence_intensity  fear_intensity  \
0  965057311499628544  1630454400           0.439707        0.472883   
1           560094127  1630454400           0.439707        0.472883   
2  995661949655441408  1630454400           0.439707        0.472883   
3  958199801807437824  1630454400           0.439707        0.472883   
4           704369491  1630454400           0.439707        0.472883   

   anger_intensity  happiness_intensity  sadness_intensity  \
0         0.468737             0.273811           0.436184   
1         0.468737             0.273811           0.436184   
2         0.468737             0.273811           0.436184   
3         0.468737             0.273811           0.436184   
4         0.468737             0.273811           0.436184   

   rolling_mean_valence  rolling_mean_fear  rolling_mean_anger  \
0              0.446884           0.457906              0.4609   
1              0.446884           0.457906              0.4609

In [15]:
# predict emotion
X_generated = generated_features[features]
predicted_sentiments = rf_clf.predict(X_generated)

generated_features['predicted_sentiment'] = predicted_sentiments

print(generated_features.head())

              user_id        date  valence_intensity  fear_intensity  \
0  965057311499628544  1630454400           0.439707        0.472883   
1           560094127  1630454400           0.439707        0.472883   
2  995661949655441408  1630454400           0.439707        0.472883   
3  958199801807437824  1630454400           0.439707        0.472883   
4           704369491  1630454400           0.439707        0.472883   

   anger_intensity  happiness_intensity  sadness_intensity  \
0         0.468737             0.273811           0.436184   
1         0.468737             0.273811           0.436184   
2         0.468737             0.273811           0.436184   
3         0.468737             0.273811           0.436184   
4         0.468737             0.273811           0.436184   

   rolling_mean_valence  rolling_mean_fear  rolling_mean_anger  \
0              0.446884           0.457906              0.4609   
1              0.446884           0.457906              0.4609

In [16]:
# evaluation
print("evaluation of clf")
y_true = validation_data['sentiment']
y_pred = clf.predict(validation_data[features])

print(classification_report(y_true, y_pred))

分类模型性能评估：
              precision    recall  f1-score   support

          -1       0.97      0.04      0.08     18349
           0       0.19      1.00      0.32      5212
           1       1.00      0.16      0.27      4931

    accuracy                           0.24     28492
   macro avg       0.72      0.40      0.23     28492
weighted avg       0.84      0.24      0.16     28492

