In [1]:
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the small feature table; every column except the "#id" identifier is a model input.
df = pd.read_csv('data/small_feature_vectors.csv')
X = df.drop(columns=["#id"])

# Targets are hard-coded rather than read from the file: five rows each at 0.0, 0.5 and 1.0.
# NOTE(review): this assumes the CSV holds exactly 15 rows in that order — confirm against the data.
y = np.repeat([0.0, 0.5, 1.0], 5)

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [4]:
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both partitions so nothing from the test rows enters the fit.
scaler = StandardScaler().fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)

In [5]:
# Seed both estimators (same seed as the train/test split) so that a fresh
# Restart & Run All reproduces the reported metrics. Without a seed the forest
# draws different bootstrap samples on every run, and the tree permutes candidate
# features at each split, which changes how ties between equally good splits resolve.
tree_model = DecisionTreeRegressor(random_state=123)
rf_model = RandomForestRegressor(random_state=123)

In [6]:
# Train both regressors on the same standardized training matrix and targets.
tree_model.fit(X=train_scaled, y=y_train)
rf_model.fit(X=train_scaled, y=y_train)

In [7]:
# Training-set error: score each model on the very rows it was fitted on.
tree_train_pred = tree_model.predict(train_scaled)
rf_train_pred = rf_model.predict(train_scaled)

tree_mse = mean_squared_error(y_train, tree_train_pred)
tree_mae = mean_absolute_error(y_train, tree_train_pred)

rf_mse = mean_squared_error(y_train, rf_train_pred)
rf_mae = mean_absolute_error(y_train, rf_train_pred)

In [11]:
# Sanity check: show the decision tree's predictions for the training rows so they
# can be compared by eye with the true training targets (y_train).
tree_model.predict(train_scaled)

array([0. , 0.5, 0.5, 0.5, 1. , 0. , 0. , 0.5, 1. , 0. , 1. , 1. ])

In [12]:
# True training targets, displayed for comparison with the model's training predictions.
y_train

array([0. , 0.5, 0.5, 0.5, 1. , 0. , 0. , 0.5, 1. , 0. , 1. , 1. ])

In [8]:
# Report training error for both models; RMSE is derived from the MSE computed earlier.
tree_rmse = sqrt(tree_mse)
rf_rmse = sqrt(rf_mse)

print("Decision Tree training mse = ", tree_mse, " & mae = ", tree_mae, " & rmse = ", tree_rmse)
print("Random Forest training mse = ", rf_mse, " & mae = ", rf_mae, " & rmse = ", rf_rmse)

Decision Tree training mse =  0.0  & mae =  0.0  & rmse =  0.0
Random Forest training mse =  0.009708333333333334  & mae =  0.09000000000000001  & rmse =  0.09853087502571635


In [9]:
# Held-out error: score each fitted model on the scaled test rows.
tree_test_pred = tree_model.predict(test_scaled)
rf_test_pred = rf_model.predict(test_scaled)

tree_test_mse = mean_squared_error(y_test, tree_test_pred)
tree_test_mae = mean_absolute_error(y_test, tree_test_pred)

rf_test_mse = mean_squared_error(y_test, rf_test_pred)
rf_test_mae = mean_absolute_error(y_test, rf_test_pred)

In [10]:
# Report held-out error for both models; RMSE is derived from the MSE computed earlier.
tree_test_rmse = sqrt(tree_test_mse)
rf_test_rmse = sqrt(rf_test_mse)

print("Decision Tree test mse = ", tree_test_mse, " & mae = ", tree_test_mae, " & rmse = ", tree_test_rmse)
print("Random Forest test mse = ", rf_test_mse, " & mae = ", rf_test_mae, " & rmse = ", rf_test_rmse)

Decision Tree test mse =  0.0  & mae =  0.0  & rmse =  0.0
Random Forest test mse =  0.028516666666666662  & mae =  0.15  & rmse =  0.16886878535320451


## All Data:

In [13]:
# Load the full feature table. "#label" is the regression target; "#id" and "#label"
# are both removed from the model inputs.
df_all = pd.read_csv('data/feature_df.csv')
X_all = df_all.drop(columns=["#id", "#label"])
y_all = df_all["#label"].to_numpy(copy=True)

In [14]:
# Preview the first five rows to check that the columns loaded as expected.
df_all.head(n=5)

Unnamed: 0,#label,#id,average_sentence_length_in_token,average_characters_per_word,average_syllables_per_word,text_length_in_token,average_number_of_noun_phrases_per_sentence,average_heights,average_number_of_subordinate_clauses_per_sentence,average_count_of_sentences_with_verb_as_root,...,POS_tag_proportion_for_numerales,POS_tag_proportion_for_adpositions,ttr,lexical_complexity_score,average_number_of_verbs_in_sentence,average_count_of_pronouns_per_sentence,average_count_of_definite_articles_per_sentence,average_semantic_similarity_of_all_nouns,average_semantic_similarity_of_all_verbs,average_semantic_similarity_of_all_adjectives\r\n
0,0.0,miniklexi_0,7.285714,4.862745,1.627451,102,2.214286,3.857143,0.071429,0.357143,...,0.176471,0.137255,0.683168,7.406711,1.285714,0.642857,0.142857,0.164365,0.339265,0.329539
1,0.0,miniklexi_1,8.090909,4.494382,1.539326,89,2.454545,4.363636,0.0,0.363636,...,0.157303,0.146067,0.590909,7.652426,0.909091,0.090909,1.454545,0.207224,0.405196,0.241409
2,0.0,miniklexi_2,7.1875,4.521739,1.634783,115,2.75,3.5,0.3125,0.6875,...,0.208696,0.113043,0.517544,8.269501,0.875,0.375,1.3125,0.257176,0.401604,0.011698
3,0.0,miniklexi_3,7.2,5.0,1.743056,144,2.0,3.85,0.15,0.35,...,0.208333,0.090278,0.531469,7.80238,1.0,0.6,0.6,0.249245,0.449268,0.249822
4,0.0,miniklexi_4,6.235294,5.132075,1.877358,106,2.176471,3.470588,0.117647,0.470588,...,0.188679,0.122642,0.704762,7.951559,0.882353,0.529412,0.647059,0.161453,0.386161,0.202404


In [23]:
# Hold out 20% of the full data set for evaluation; the fixed seed keeps the split repeatable.
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=123,
)

In [24]:
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both partitions so nothing from the test rows enters the fit.
all_scaler = StandardScaler().fit(X_train_all)
all_train_scaled = all_scaler.transform(X_train_all)
all_test_scaled = all_scaler.transform(X_test_all)

In [25]:
# Seed both estimators (same seed as the train/test split) so that a fresh
# Restart & Run All reproduces the reported metrics. Without a seed the forest
# draws different bootstrap samples on every run, and the tree permutes candidate
# features at each split, which changes how ties between equally good splits resolve.
all_tree_model = DecisionTreeRegressor(random_state=123)
all_rf_model = RandomForestRegressor(random_state=123)

In [26]:
# Train both regressors on the same standardized training matrix and targets.
all_tree_model.fit(X=all_train_scaled, y=y_train_all)
all_rf_model.fit(X=all_train_scaled, y=y_train_all)

In [27]:
# Training-set error: score each model on the very rows it was fitted on.
all_tree_train_pred = all_tree_model.predict(all_train_scaled)
all_rf_train_pred = all_rf_model.predict(all_train_scaled)

all_tree_mse = mean_squared_error(y_train_all, all_tree_train_pred)
all_tree_mae = mean_absolute_error(y_train_all, all_tree_train_pred)

all_rf_mse = mean_squared_error(y_train_all, all_rf_train_pred)
all_rf_mae = mean_absolute_error(y_train_all, all_rf_train_pred)

In [29]:
#all_tree_model.predict(all_train_scaled)

In [30]:
# Report training error for both models; RMSE is derived from the MSE computed earlier.
all_tree_rmse = sqrt(all_tree_mse)
all_rf_rmse = sqrt(all_rf_mse)

print("Decision Tree training mse = ", all_tree_mse, " & mae = ", all_tree_mae, " & rmse = ", all_tree_rmse)
print("Random Forest training mse = ", all_rf_mse, " & mae = ", all_rf_mae, " & rmse = ", all_rf_rmse)

Decision Tree training mse =  0.0  & mae =  0.0  & rmse =  0.0
Random Forest training mse =  0.0015783898305084748  & mae =  0.015593220338983055  & rmse =  0.03972895456098077


In [31]:
# Held-out error: score each fitted model on the scaled test rows.
all_tree_test_pred = all_tree_model.predict(all_test_scaled)
all_rf_test_pred = all_rf_model.predict(all_test_scaled)

all_tree_test_mse = mean_squared_error(y_test_all, all_tree_test_pred)
all_tree_test_mae = mean_absolute_error(y_test_all, all_tree_test_pred)

all_rf_test_mse = mean_squared_error(y_test_all, all_rf_test_pred)
all_rf_test_mae = mean_absolute_error(y_test_all, all_rf_test_pred)

In [32]:
# Report held-out error for both models; RMSE is derived from the MSE computed earlier.
all_tree_test_rmse = sqrt(all_tree_test_mse)
all_rf_test_rmse = sqrt(all_rf_test_mse)

print("Decision Tree test mse = ", all_tree_test_mse, " & mae = ", all_tree_test_mae, " & rmse = ", all_tree_test_rmse)
print("Random Forest test mse = ", all_rf_test_mse, " & mae = ", all_rf_test_mae, " & rmse = ", all_rf_test_rmse)

Decision Tree test mse =  0.01694915254237288  & mae =  0.02824858757062147  & rmse =  0.13018891098082386
Random Forest test mse =  0.009920903954802262  & mae =  0.041186440677966105  & rmse =  0.0996037346428449
