<center><h1> Imports </h1></center>

In this section, we import all necessary modules that will be used in the report.

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

<center><h1> Dataframe Creation </h1></center>

In this section, we create the dataframe and any other necessary global variables that will be used in the report.

In [5]:
def import_samples(path="combined_samples.csv"):
    """Load the song samples CSV into a DataFrame.

    The "Unnamed: 0" column is dropped after reading (presumably a row index
    written when the CSV was saved -- confirm against the file's producer).

    Parameters
    ----------
    path : str or path-like, default "combined_samples.csv"
        Location of the CSV file. The default keeps the original
        no-argument call working unchanged.

    Returns
    -------
    pandas.DataFrame
        The file contents without the "Unnamed: 0" column.

    Raises
    ------
    KeyError
        If the file has no "Unnamed: 0" column.
    """
    raw = pd.read_csv(path)
    return raw.drop(columns="Unnamed: 0")

In [6]:
# Load the dataset once; `samples` is the frame later passed to features_and_target.
samples = import_samples()
# Bare expression so the notebook renders the frame as this cell's output.
samples

Unnamed: 0,Name,Artist,Acousticness,Danceability,Speechiness,Energy,Liveness,Valence,Key,Mode,Tempo,Duration,Popularity
0,All I Want for Christmas Is You,Mariah Carey,0.1640,0.335,0.0386,0.6250,0.0708,0.3460,7.0,1.0,150.277,241107.0,70.0
1,Jingle Bell Rock,Bobby Helms,0.6430,0.754,0.0363,0.4240,0.0652,0.8060,2.0,1.0,119.705,130973.0,61.0
2,Rockin' Around The Christmas Tree,Brenda Lee,0.6140,0.589,0.0502,0.4720,0.5050,0.8980,8.0,1.0,67.196,126267.0,62.0
3,A Holly Jolly Christmas - Single Version,Burl Ives,0.5790,0.683,0.0303,0.3750,0.0760,0.8880,0.0,1.0,140.467,135533.0,52.0
4,The Christmas Song (Merry Christmas To You),Nat King Cole,0.9200,0.319,0.0341,0.2100,0.1380,0.2090,1.0,1.0,78.696,192160.0,58.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,The Spelling Rules / My Favorite Moment of the...,25th Annual Putnam County Spelling Bee Origina...,0.8020,0.527,0.1550,0.2890,0.1060,0.4870,2.0,1.0,169.072,101947.0,28.0
1996,No Sé Vivir Sin Ti,Conjunto Primavera,0.0698,0.467,0.0305,0.5370,0.1450,0.5560,10.0,1.0,146.465,213253.0,20.0
1997,"Divertimento No. 11 in D Major, K. 251: III. A...",Wolfgang Amadeus Mozart,0.9200,0.279,0.0421,0.0727,0.1880,0.2030,9.0,1.0,130.693,224760.0,26.0
1998,Etude Op. 25 : No. 12 in C Minor,Frédéric Chopin,0.9860,0.183,0.0333,0.1730,0.1120,0.0736,5.0,0.0,75.121,166253.0,13.0


<center><h2> Creating Features and Target </h2></center>

In [7]:
def features_and_target(df):
    """Separate a samples frame into model inputs and the prediction target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the "Name", "Artist" and "Popularity" columns.

    Returns
    -------
    tuple
        `(features, target)`: `features` is `df` without the "Name", "Artist"
        and "Popularity" columns, and `target` is the "Popularity" column.
        `df` itself is not modified.
    """
    excluded_columns = ["Name", "Artist", "Popularity"]
    feature_frame = df.drop(columns=excluded_columns)
    popularity = df["Popularity"]
    return (feature_frame, popularity)

In [8]:
# Notebook-level model inputs and Popularity target; split_the_dataset reads both as globals.
features, target = features_and_target(samples)

In [9]:
# Display the feature frame to confirm Name, Artist and Popularity were removed.
features

Unnamed: 0,Acousticness,Danceability,Speechiness,Energy,Liveness,Valence,Key,Mode,Tempo,Duration
0,0.1640,0.335,0.0386,0.6250,0.0708,0.3460,7.0,1.0,150.277,241107.0
1,0.6430,0.754,0.0363,0.4240,0.0652,0.8060,2.0,1.0,119.705,130973.0
2,0.6140,0.589,0.0502,0.4720,0.5050,0.8980,8.0,1.0,67.196,126267.0
3,0.5790,0.683,0.0303,0.3750,0.0760,0.8880,0.0,1.0,140.467,135533.0
4,0.9200,0.319,0.0341,0.2100,0.1380,0.2090,1.0,1.0,78.696,192160.0
...,...,...,...,...,...,...,...,...,...,...
1995,0.8020,0.527,0.1550,0.2890,0.1060,0.4870,2.0,1.0,169.072,101947.0
1996,0.0698,0.467,0.0305,0.5370,0.1450,0.5560,10.0,1.0,146.465,213253.0
1997,0.9200,0.279,0.0421,0.0727,0.1880,0.2030,9.0,1.0,130.693,224760.0
1998,0.9860,0.183,0.0333,0.1730,0.1120,0.0736,5.0,0.0,75.121,166253.0


In [10]:
# Display the target Series (the Popularity column).
target

0       70.0
1       61.0
2       62.0
3       52.0
4       58.0
        ... 
1995    28.0
1996    20.0
1997    26.0
1998    13.0
1999    29.0
Name: Popularity, Length: 2000, dtype: float64

<center><h2> Splitting the Training and Testing Sets </h2></center>

In [11]:
def split_the_dataset(X=None, y=None, random_state=3000):
    """Split features and target into training and testing partitions.

    Called with no arguments, this splits the notebook-level `features` and
    `target` with seed 3000. Passing `X`, `y` or `random_state` explicitly
    removes the dependence on notebook globals.

    Parameters
    ----------
    X : DataFrame or array-like, optional
        Feature matrix. Defaults to the notebook-level `features`.
    y : Series or array-like, optional
        Target values. Defaults to the notebook-level `target`.
    random_state : int, default 3000
        Seed forwarded to `train_test_split` so the partition is repeatable.

    Returns
    -------
    list
        `[X_train, X_test, y_train, y_test]` as returned by
        `train_test_split`. No `test_size` is passed, so scikit-learn's
        default split proportion applies.
    """
    # Fall back to the notebook globals only when the caller supplied nothing.
    if X is None:
        X = features
    if y is None:
        y = target
    return train_test_split(X, y, random_state=random_state)

In [12]:
# Notebook-level partitions; the evaluation functions below read these as globals.
X_train, X_test, y_train, y_test = split_the_dataset()

<center><h2> Creating Estimator Dictionary </h2></center>

In [13]:
# Regressors compared in the report, keyed by the label printed with their scores.
# These instances are shared: every evaluation function refits them in place.
estimators = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=1.0),
    "k-Nearest Neighbor": KNeighborsRegressor(n_neighbors=5),
    # LinearSVR's solver uses pseudo-random numbers; seed it with the same value
    # used for the split and RFE (3000) so re-running the notebook gives the
    # same scores.
    "Support Vector Machine": LinearSVR(C=1, max_iter=1000000, random_state=3000),
}

<center><h1> Simple Regression Testing </h1></center>

In [14]:
def regressors_percentage_split():
    """Fit every regressor on the unscaled split and print its R-squared scores.

    Reads the notebook-level `estimators`, `X_train`, `X_test`, `y_train` and
    `y_test`. Each estimator is fitted in place on the training partition, then
    its R-squared on the training and testing partitions is printed under the
    estimator's label. Returns nothing.
    """
    for label in estimators:
        regressor = estimators[label]
        regressor.fit(X=X_train, y=y_train)

        print(label + ":")
        train_r2 = r2_score(y_train, regressor.predict(X_train))
        print("\tR-squared value for training set: ", train_r2)
        test_r2 = r2_score(y_test, regressor.predict(X_test))
        print("\tR-squared value for testing set: ", test_r2)
        # Blank line between estimators.
        print("")

In [15]:
# Baseline: fit and score every estimator on the raw, unscaled partitions.
regressors_percentage_split()

Linear Regression:
	R-squared value for training set:  0.4032983281834135
	R-squared value for testing set:  0.4476074135539382

Ridge:
	R-squared value for training set:  0.40318953475678954
	R-squared value for testing set:  0.44745951722438304

Lasso:
	R-squared value for training set:  0.3017374732669593
	R-squared value for testing set:  0.322994310176522

k-Nearest Neighbor:
	R-squared value for training set:  0.5218446160519321
	R-squared value for testing set:  0.17409317888347886

Support Vector Machine:
	R-squared value for training set:  -0.7148122110003963
	R-squared value for testing set:  -0.653544506904441





<center><h1> Preprocessed Regression Testing </h1></center>

In [16]:
def preprocessed_regression():
    """Repeat the regressor comparison on min-max scaled features.

    Reads the notebook-level `estimators`, `X_train`, `X_test`, `y_train` and
    `y_test`. Each estimator is refitted in place on the scaled training data
    and its training/testing R-squared values are printed.

    Returns
    -------
    tuple
        `(X_train_scaled, X_test_scaled)`: the scaled training and testing
        feature arrays.
    """
    # The scaler is fitted on the training partition only; the test partition
    # is transformed with that same fitted scaler.
    min_max = MinMaxScaler()
    train_scaled = min_max.fit_transform(X_train)
    test_scaled = min_max.transform(X_test)

    for label in estimators:
        regressor = estimators[label]
        regressor.fit(X=train_scaled, y=y_train)

        print(label + ":")
        train_r2 = r2_score(y_train, regressor.predict(train_scaled))
        print("\tR-squared value for training set: ", train_r2)
        test_r2 = r2_score(y_test, regressor.predict(test_scaled))
        print("\tR-squared value for testing set: ", test_r2)
        print("")

    return (train_scaled, test_scaled)

In [17]:
# Keep the scaled arrays at notebook level; RFE_feature_selection reads them as globals.
X_train_scaled, X_test_scaled = preprocessed_regression()

Linear Regression:
	R-squared value for training set:  0.4032983281834136
	R-squared value for testing set:  0.4476074135537863

Ridge:
	R-squared value for training set:  0.4031664072037111
	R-squared value for testing set:  0.4482070265540785

Lasso:
	R-squared value for training set:  0.30387367807841004
	R-squared value for testing set:  0.3363385208874664

k-Nearest Neighbor:
	R-squared value for training set:  0.6781297748525699
	R-squared value for testing set:  0.48202655173194275

Support Vector Machine:
	R-squared value for training set:  0.37359654925469676
	R-squared value for testing set:  0.4332253920292193



<center><h1> Feature Selected Regression Testing </h1></center>

---



In [18]:
def RFE_feature_selection():
    """Select five features with RFE and score a kNN regressor on them.

    Reads the notebook-level `X_train_scaled`, `X_test_scaled`, `y_train`,
    `y_test` and `features`. Recursive feature elimination, driven by a
    decision-tree regressor seeded with 3000, is fitted on the scaled training
    data. The names of the kept features are printed, followed by the
    training/testing R-squared of a 5-neighbour kNN regressor trained on the
    reduced data.

    Returns
    -------
    tuple
        `(X_train_selected, X_test_selected)`: the scaled arrays restricted to
        the selected columns.
    """
    selector = RFE(DecisionTreeRegressor(random_state=3000), n_features_to_select=5)
    selector.fit(X_train_scaled, y_train)

    train_selected = selector.transform(X_train_scaled)
    test_selected = selector.transform(X_test_scaled)

    knn = KNeighborsRegressor(n_neighbors=5).fit(X=train_selected, y=y_train)

    # Boolean mask over the input columns; its positions are used to look up
    # names in `features.columns`.
    keep_mask = selector.get_support()
    print("Selected features after RFE:")
    for position, is_kept in enumerate(keep_mask):
        if is_kept:
            print("\t" + features.columns[position])

    print("kNN Regression performance with selected features:")
    train_r2 = r2_score(y_train, knn.predict(train_selected))
    print("\tR-squared value for training set: ", train_r2)
    test_r2 = r2_score(y_test, knn.predict(test_selected))
    print("\tR-squared value for testing set: ", test_r2)

    return (train_selected, test_selected)

In [19]:
# Keep the reduced arrays at notebook level; grid_search_kNN reads them as globals.
X_train_selected, X_test_selected = RFE_feature_selection()

Selected features after RFE:
	Acousticness
	Danceability
	Speechiness
	Valence
	Duration
kNN Regression performance with selected features:
	R-squared value for training set:  0.6793504491664453
	R-squared value for testing set:  0.5333832959609355


<center><h1> Grid Search Regression Testing </h1></center>

In [20]:
# Hyper-parameter grid searched for the kNN regressor.
# "minkowski" is intentionally absent: with KNeighborsRegressor's default p=2 it
# is the same distance as "euclidean", so listing both only repeats candidates
# and adds cross-validation fits without changing the selected parameters.
param_grid = {
    "n_neighbors": [1, 5, 10],
    "metric": ["euclidean", "manhattan"],
}

In [21]:
def grid_search_kNN():
    """Grid-search kNN hyper-parameters on the RFE-selected features.

    Reads the notebook-level `param_grid`, `X_train_selected`,
    `X_test_selected`, `y_train` and `y_test`. A 5-fold `GridSearchCV` over
    `KNeighborsRegressor` is fitted on the selected training features. The best
    parameter combination is then printed, followed by the search object's
    score on the training and testing partitions. Returns nothing.
    """
    search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5)
    search.fit(X=X_train_selected, y=y_train)

    print("Best parameters: ", search.best_params_)
    train_score = search.score(X_train_selected, y_train)
    print("Training set score with best parameters: ", train_score)
    test_score = search.score(X_test_selected, y_test)
    print("Test set score with best parameters: ", test_score)

In [22]:
# Run the kNN grid search and print the best parameters with their train/test scores.
grid_search_kNN()

Best parameters:  {'metric': 'manhattan', 'n_neighbors': 5}
Training set score with best parameters:  0.6905581327004229
Test set score with best parameters:  0.5318469049500265
