In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

from collections import Counter

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Seed NumPy's global random generator once, up front, so numpy-driven
# randomness further down is repeatable between runs.
seed = 9
np.random.seed(seed)

# Training table; the path is resolved relative to the working directory.
data = pd.read_csv(
    "train_NIR5Yl1.csv"
)


In [3]:
# Columns removed before modelling; everything else is kept in its
# original order.
non_feature_columns = ['ID', 'Tag', 'Username']
data_to_use = data.drop(non_feature_columns, axis='columns')

# Disabled experiment: one-hot encode 'Tag' with get_dummies and prepend the
# resulting indicator columns instead of discarding 'Tag' outright.
# one_hot_tag = pd.get_dummies(data['Tag'])
# data_to_use = pd.concat([one_hot_tag,data_to_use], axis=1).drop(['a'], axis=1)


In [4]:
# The last column of data_to_use is the regression target; every column
# before it is used as a feature.
X = data_to_use.iloc[:, :-1]
Y = data_to_use.iloc[:, -1]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=99,
)

In [5]:
# Registry of candidate regressors, looked up by a short key when the
# pipelines are assembled.
regressors = {
    'gbr': GradientBoostingRegressor(
        n_estimators=128,
        learning_rate=0.1,
        max_depth=3,
        random_state=0,
    ),
}

In [6]:
# Polynomial regression: scale -> degree-4 polynomial expansion -> least squares.
#
# Each step is named after what it actually holds, so that
# pipeline3.named_steps[...] and "<step>__<param>" grid-search keys are
# unambiguous:
#   'standardize' - StandardScaler (with_mean=False, as in the other pipeline)
#   'poly'        - PolynomialFeatures feature expansion
#   'regressor'   - the final LinearRegression estimator
estimators3 = [
    ('standardize', StandardScaler(with_mean=False)),
    ('poly', PolynomialFeatures(degree=4)),
    ('regressor', LinearRegression()),
]
pipeline3 = Pipeline(estimators3)

# Disabled hyper-parameter search. To enable it, define `param_grid` first
# (e.g. {'poly__degree': [2, 3, 4]}). GridSearchCV maximises its score, hence
# the negated-MSE scorer name.
# grid = GridSearchCV(pipeline3, cv=3, n_jobs=12, param_grid=param_grid, scoring='neg_mean_squared_error')
# grid.fit(X_train,Y_train)

# print(grid.best_score_)

pipeline3.fit(X_train, Y_train)

# Hold-out predictions are kept in a variable because the ensemble cell
# reuses them.
y_pred3 = pipeline3.predict(X_test)

print("Train RMSE:",np.sqrt(mean_squared_error(Y_train,pipeline3.predict(X_train))))
print("Test RMSE:",np.sqrt(mean_squared_error(Y_test,y_pred3)))


Train RMSE: 967.9888603288991
Test RMSE: 928.3141273058224


In [7]:
# Gradient boosting: scale the features, then fit the shared 'gbr'
# regressor taken from the `regressors` registry.
estimators2 = [
    ('standardize', StandardScaler(with_mean=False)),
    ('regressor', regressors['gbr']),
]
pipeline2 = Pipeline(estimators2)

# Disabled grid search. As written it refers to `pipeline` and `param_grid`,
# neither of which is defined in the cells shown here.
# grid = GridSearchCV(pipeline, cv=3, n_jobs=12, param_grid=param_grid, scoring='mean_squared_error')
# grid.fit(X_train,Y_train)

# print(grid.best_score_)

pipeline2.fit(X_train, Y_train)

# Hold-out predictions are stored for reuse by the ensemble cell.
y_pred2 = pipeline2.predict(X_test)

# Report the root-mean-squared error on the fitted rows and on the held-out rows.
y_train_pred2 = pipeline2.predict(X_train)
print("Train RMSE:", np.sqrt(mean_squared_error(Y_train, y_train_pred2)))
print("Test RMSE:", np.sqrt(mean_squared_error(Y_test, y_pred2)))


Train RMSE: 655.118181119385
Test RMSE: 1172.129748662129


In [8]:
# Blend the two models with an unweighted mean of their hold-out
# predictions, then score the blend against the held-out targets.
ensemble = (y_pred3 + y_pred2) / 2
print("Test RMSE:", np.sqrt(mean_squared_error(Y_test, ensemble)))

Test RMSE: 989.1304641172596


In [9]:
# Submission file generation: run both fitted pipelines on the test CSV,
# average their predictions and write one 'Upvotes' value per 'ID'.
test_data = pd.read_csv("test_8i3B3FC.csv")

# Drop the same three columns that were removed from the training table.
test_data_to_use = test_data.drop(['ID', 'Tag', 'Username'], axis='columns')

test_pred3 = pipeline3.predict(test_data_to_use)
test_pred2 = pipeline2.predict(test_data_to_use)
test_pred = (test_pred3 + test_pred2) / 2

# Place the averaged predictions next to the original IDs (aligned on the
# default row index) and save without writing the index column.
upvotes_frame = pd.DataFrame(test_pred, columns=['Upvotes'])
final_res = pd.concat([test_data['ID'], upvotes_frame], axis=1)
final_res.to_csv('sub_poly.csv', index=False)