In [153]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

In [154]:
#Import Data
data = pd.read_csv("./Downloads/coffeedata.csv")
data.head()

Unnamed: 0,CupperPoints,ID,Country,HarvestYear,Variety,Process,Aroma,Flavor,Aftertaste,Acidity,...,MeanAlt,Address,Contact,Elevation,FullElevation,Precipitation,Temperature,Continent,NSHem,EWHem
0,10.0,263,Taiwan,2012,Unknown,Unknown,7.0,7.08,7.0,7.17,...,Unknown,10,10,1150.0,1150.0,2500.0,16.0,Asia,N,E
1,10.0,267,Taiwan,2012,Unknown,Unknown,7.25,7.25,7.0,7.08,...,200,10,10,1150.0,200.0,2500.0,16.0,Asia,N,E
2,10.0,277,Taiwan,2013,Unknown,Unknown,7.67,7.67,7.0,7.08,...,Unknown,10,10,1150.0,1150.0,2500.0,16.0,Asia,N,E
3,10.0,278,Taiwan,2012,Unknown,Unknown,7.58,7.58,7.17,7.17,...,Unknown,10,10,1150.0,1150.0,2500.0,16.0,Asia,N,E
4,9.25,1096,Guatemala,2010,Bourbon,Unknown,8.42,8.5,8.42,8.42,...,1700,2,2,759.0,1700.0,1996.0,23.45,North America,N,W


In [155]:
df = pd.DataFrame(data)

df['Process']=pd.Categorical(df['Process'])
dfProcessDummies = pd.get_dummies(df['Process'], prefix='Process')
df = pd.concat([df,dfProcessDummies],axis=1)

df['Color']=pd.Categorical(df['Color'])
dfColorDummies = pd.get_dummies(df['Process'],prefix='Color')
df = pd.concat([df,dfColorDummies],axis=1)

df['HarvestYear']=pd.Categorical(df['HarvestYear'])
dfHYDummies = pd.get_dummies(df['HarvestYear'],prefix='Year')
df = pd.concat([df,dfHYDummies],axis=1)

df['Variety']=pd.Categorical(df['Variety'])
dfVarDummies = pd.get_dummies(df['Variety'],prefix='Variety')
df = pd.concat([df,dfVarDummies],axis=1)

df['Continent']=pd.Categorical(df['Continent'])
dfContDummies = pd.get_dummies(df['Continent'],prefix='Cont')
df = pd.concat([df,dfContDummies],axis=1)

df['Contact']=pd.Categorical(df['Contact'])
dfContactDummies = pd.get_dummies(df['Contact'],prefix='Contact')
df = pd.concat([df,dfContactDummies],axis=1)

df.drop(['Country','HarvestYear','Variety','Process','Color','Continent','NSHem','EWHem',
         'Contact','Address','Elevation','ID','CupperPoints','MeanAlt'],
        axis=1,inplace=True)

cup = data['CupperPoints']

In [164]:
X_train, X_test, y_train, y_test = train_test_split(df, cup, train_size=0.85, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index.values, columns=X_train.columns.values)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index.values, columns=X_test.columns.values)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [165]:
pca = PCA()

pca.fit(X_train)
cpts = pd.DataFrame(pca.transform(X_train))
x_axis = np.arange(1, pca.n_components_+1)

pca_scaled = PCA()
pca_scaled.fit(X_train_scaled)
cpts_scaled = pd.DataFrame(pca.transform(X_train_scaled))

In [184]:
rf = RandomForestRegressor(n_estimators=200, oob_score=True, random_state=0)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

In [185]:
predicted_train = rf.predict(X_train)
predicted_test = rf.predict(X_test.fillna(X_test.mean()))

test_score = r2_score(y_test, predicted_test)
spearman = spearmanr(y_test, predicted_test)
pearson = pearsonr(y_test, predicted_test)

In [191]:
print(str(round(test_score*100))+"% R2")

68.0% R2
