import statements:

In [18]:
import Features_Modules as fm

In [19]:
import pandas as pd
import boto3
import os
from scipy.stats import spearmanr
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
# import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import kendalltau, pearsonr, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
import tempfile
import joblib
%matplotlib inline

Run the regression and get results (on training data)

In [None]:
# Load the cleaned training split and build the modelling inputs from it.
df = fm.read_clean_data('clean_train.csv')
# `features` and `scaler` are kept because later cells reuse them on the test sets.
# NOTE(review): [(1,2)] is presumably a list of n-gram ranges for the count
# vectors -- confirm against Features_Modules.pre_process_data.
model_data, features, scaler = fm.pre_process_data(df, [(1,2)])

In [None]:
# Fit the regression on the pre-processed training matrix. The call returns
# the fitted estimator plus two training-set metrics, reported below.
fit_result = fm.fit_regression(model_data, njobs=10)
reg, spear, r2 = fit_result

print(
    "correlation with actual rank:", spear,
    "\nR^2 value for model:", r2,
)

Read in test data (from test/train split)

In [None]:
# Load the test split with the same reader used for the training data.
df_test = fm.read_clean_data('clean_test.csv')
# Optional sanity check, left disabled:
# df_test.head()

In [None]:
# Build the hold-out design matrix: count vectors restricted to the training
# feature list, plus the sequence-length and pH columns.
test1 = fm.use_features(features, df_test)
test2 = fm.vector_seq_len(df_test)
test1['length'] = test2.n_aa
test1['pH'] = test2.pH

# Apply the existing scaler with transform(), NOT fit_transform(): refitting here
# would standardise the hold-out set with its own mean/variance (so the model
# sees inputs on a different scale than it was trained on) and would also
# overwrite the statistics inside the `scaler` object that is persisted later.
# NOTE(review): assumes fm.pre_process_data returns `scaler` already fitted on
# the training data -- confirm in Features_Modules.
test1 = pd.DataFrame(scaler.transform(test1), columns = test1.columns)



In [None]:
# Score the fitted regression on the scaled hold-out features.
prediction = reg.predict(test1)

# First element of fm.get_spearman's result is reported as the rank correlation.
holdout_rank_corr = fm.get_spearman(prediction, df_test.label)[0]
holdout_r2 = reg.score(test1, df_test.label)

print(
    "correlation with actual rank:", holdout_rank_corr,
    "\nR^2 value for model:", holdout_r2,
)

Persist the model

In [None]:
# S3 bucket that receives the persisted artifacts (passed as Bucket= to put_object).
bucket = 'tech-x-final-project'
# NOTE(review): not referenced by the cells visible here -- the S3 keys spell
# out "models/" directly.
MODEL_FOLDER = 'models'

In [None]:
# Print the region name of a default boto3 session as a quick configuration check.
my_region = boto3.session.Session().region_name
print(my_region)
# Client used by the following cell to upload the pickled artifacts.
s3 = boto3.client('s3')

In [None]:
# Object keys for the three artifacts needed to reproduce predictions:
# the fitted regression, the scaler, and the feature list.
key1 = "models/reg2_model.pkl"
key2 = "models/reg2_scaler.pkl"
key3 = "models/reg2_features.pkl"

# Serialise each artifact into its OWN temporary file before uploading.
# Reusing one file handle is wrong: after fp.read() the position sits at EOF,
# so the next joblib.dump() appends, and the following seek(0)/read() uploads
# the earlier pickles as a prefix. Loading such an object returns the first
# pickle (the regression model) instead of the scaler / feature list.
for artifact, key in ((reg, key1), (scaler, key2), (features, key3)):
    with tempfile.TemporaryFile() as fp:
        joblib.dump(artifact, fp)
        fp.seek(0)  # rewind so read() returns the bytes just written
        s3.put_object(Key=key, Body=fp.read(), Bucket=bucket)

Read in KAGGLE test set

In [None]:
DATASET = 'test.csv'
LABELS = 'test_labels.csv'
MAIN_FOLDER = 's3://tech-x-final-project'
DATA_FOLDER = 'raw-data'

# S3 URLs always use forward slashes. os.path.join would insert the host OS
# separator (a backslash on Windows) and produce an invalid URL, so the parts
# are joined with "/" explicitly.
test_data = pd.read_csv("/".join((MAIN_FOLDER, DATA_FOLDER, DATASET)))
test_data = test_data.drop(columns=['data_source'])
print(len(test_data))
test_labels = pd.read_csv("/".join((MAIN_FOLDER, DATA_FOLDER, LABELS)))

# Join on seq_id rather than relying on row order, in case the two files are
# not sorted identically; the key is dropped once the rows are paired up.
test = pd.merge(test_data, test_labels, on='seq_id').drop(columns='seq_id')

# Split off the target column (`tm`) so `test` holds only model inputs.
labels = test.tm
test = test.drop(columns='tm')

In [None]:
# Number of distinct values in each remaining column of the merged test frame.
test.nunique()

In [None]:
# Distinct label count, then the label vector's shape, one per line.
print(labels.nunique(), labels.shape, sep="\n")

Vectorize Test Data

In [None]:
# Get count vectors for the Kaggle test set, restricted to the feature list
# produced during training pre-processing.
test1 = fm.use_features(features, test)
# Second helper supplies the `n_aa` and `pH` columns read below.
test2 = fm.vector_seq_len(test)
# Attach both as extra columns, named as in the earlier hold-out cell.
test1['length'] = test2.n_aa
test1['pH'] = test2.pH

In [None]:
# Scale with the existing scaler via transform(), NOT fit_transform():
# refitting would standardise the Kaggle set with its own mean/variance, so
# the regression would receive inputs on a different scale than it was trained
# on, and the shared `scaler` object would be silently overwritten.
# NOTE(review): assumes fm.pre_process_data returns `scaler` already fitted on
# the training data -- confirm in Features_Modules.
# NOTE: this cell rebinds `test1`; re-run the vectorising cell before running
# it a second time, otherwise the data is scaled twice.
test1 = pd.DataFrame(scaler.transform(test1), columns = test1.columns)

In [None]:
# test1.head()

In [None]:
# Score the fitted regression on the scaled Kaggle test features.
prediction = reg.predict(test1)

# First element of fm.get_spearman's result is reported as the rank correlation.
kaggle_rank_corr = fm.get_spearman(prediction, labels)[0]
kaggle_r2 = reg.score(test1, labels)

print(
    "correlation with actual rank:", kaggle_rank_corr,
    "\nR^2 value for model:", kaggle_r2,
)