In [1]:
import pandas as pd
import sklearn

from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [2]:
# Load the training and held-out splits and separate the regression target
# ('play_count') from the feature columns. Everything is cast to float.

# Training split. To switch datasets, swap the active read_csv line with the
# commented alternative below (NOTE(review): the alternative's path has no
# '../dataset/' prefix -- confirm where that file actually lives).
df = pd.read_csv('../dataset/train_with_countries.csv', index_col=0)
# df = pd.read_csv('train_no_countries.csv', index_col=0)

# pop() removes the target column from df in place and returns it, so df
# holds only the feature columns afterwards.
y_train = df.pop('play_count').astype(float)
X_train = df.astype(float)

# Held-out split, handled the same way (NOTE(review): the alternative path
# uses '../Dataset/' with a capital D -- confirm on case-sensitive systems).
df_test = pd.read_csv('../dataset/test_with_countries.csv', index_col=0)
# df_test = pd.read_csv('../Dataset/test_no_countries.csv', index_col=0)

y_val = df_test.pop('play_count').astype(float)
X_val = df_test.astype(float)

# Preview the held-out feature columns (target already removed).
df_test.head()

Unnamed: 0,energy,liveness,tempo,speechiness,acousticness,instrumentalness,time_signature,danceability,KEY,duration_ms,...,country_UW,country_UY,country_VG,country_VN,country_VS,country_XM,country_YW,country_ZA,country_ZM,country_ZZ
0,0.0555,0.0379,109.452,0.0345,0.983,0.928,3.0,0.366,7.0,295133,...,0,0,0,0,0,0,0,0,0,0
1,0.139,0.182,131.116,0.0321,0.995,0.885,4.0,0.521,8.0,157333,...,0,0,0,0,0,0,0,0,0,0
2,0.164,0.106,92.34,0.0334,0.843,0.893,1.0,0.476,7.0,730666,...,0,0,0,0,0,0,0,0,0,0
3,0.376,0.0887,115.746,0.0576,0.875,0.859,3.0,0.533,2.0,493866,...,0,0,0,0,0,0,0,0,0,0
4,0.318,0.354,109.852,0.0458,0.537,0.658,4.0,0.594,1.0,532173,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# SGD regressor on min-max-scaled, degree-2 polynomial features; the SGD
# hyper-parameters are selected by a 5-fold grid search.
# A fixed random_state makes the stochastic fit repeatable between runs.
sgd_model = SGDRegressor(random_state=42)

# Stand-alone pipeline with the same preprocessing and a plain SGD step.
# It is not fitted or used in this cell; it is kept because the name may be
# referenced elsewhere in the notebook.
pipeline = Pipeline(steps = [
    ('mm',   MinMaxScaler()),
    ('poly', PolynomialFeatures(2)),
    ('sgd',  sgd_model)
])

# scikit-learn 1.0 renamed the 'squared_loss' option to 'squared_error' (the
# old spelling was removed in 1.2), so pick the name the installed version
# understands.
sklearn_major = int(sklearn.__version__.split('.')[0])
squared_loss_name = 'squared_error' if sklearn_major >= 1 else 'squared_loss'

# Hyper-parameter grid for the SGD regressor
parameters = {'loss':[squared_loss_name, 'huber'], 'tol':[0.01, 0.001, 1e-4], 'penalty': ['l1', 'l2']}

# Keep a handle on the search object: the surrounding Pipeline does not expose
# best_estimator_, only the GridSearchCV step does. The pipeline fits this
# same instance in place, so it can be inspected after clf.fit().
grid_search = GridSearchCV(sgd_model, parameters, cv=5)
clf = make_pipeline(MinMaxScaler(),
                    PolynomialFeatures(2),
                    grid_search)
clf.fit(X_train, y_train)

# Train/predict using the whole pipeline
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)

# Report the mean absolute error on the training and held-out sets
print('-----------This is ten dataset with countries--------')
# print('-----------This is ten dataset without countries--------')
print(grid_search.best_estimator_)
print("train MAE: ", mean_absolute_error(y_train, y_train_pred))
print("test MAE: ", mean_absolute_error(y_val, y_val_pred))