import statements:

In [1]:
import Features_Modules as fm

In [2]:
import pandas as pd
import boto3
import os
from scipy.stats import spearmanr
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
# import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import kendalltau, pearsonr, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
import tempfile
import joblib
%matplotlib inline

Run the regression and get results (on training data)

In [3]:
df = fm.read_clean_data('clean_train.csv')
model_data, features, scaler = fm.pre_process_data(df, [(1,1)])

In [4]:
reg, spear, r2 = fm.fit_regression(model_data, njobs=10)
print("correlation with actual rank:", spear,"\nR^2 value for model:", r2)

correlation with actual rank: 0.3211504517929924 
R^2 value for model: 0.1720589076797865


Read in test data (from test/train split)

In [5]:
df_test = fm.read_clean_data('clean_test.csv')
# df_test.head()

In [6]:
# Get Count Vectors, length vector, and pH vector... then scale
test1 = fm.use_features(features, df_test)
test2 = fm.vector_seq_len(df_test)
test1['length'] = test2.n_aa
test1['pH'] = test2.pH
test1 = pd.DataFrame(scaler.fit_transform(test1), columns = test1.columns)



In [7]:
test1.head()

Unnamed: 0,a,c,d,e,f,g,h,i,k,l,...,p,q,r,s,t,v,w,y,length,pH
0,-0.541589,-0.477819,-0.332804,-0.282142,-0.574289,-0.628824,-0.009128,-0.233484,-0.180653,-0.714718,...,-0.44549,-0.338693,0.116298,0.755538,0.001252,-0.52445,-0.531144,-0.551847,-0.292645,-0.139748
1,-0.204847,0.211568,-0.263952,-0.66023,-0.261,2.115797,-0.330355,-0.488549,-0.439416,-0.671358,...,0.717687,1.414681,-0.770973,0.157001,-0.224684,-0.559135,1.772517,0.080897,0.009899,3.170314
2,-0.064537,-0.324622,-0.298378,-0.584613,0.05229,-0.09974,-0.571275,-0.275995,-0.504106,-0.064315,...,-0.412256,-0.229107,-0.153741,-0.291902,-0.375309,0.065208,-0.20205,-0.472754,-0.28602,-0.139748
3,-0.317094,-0.40122,0.045879,0.448828,-0.574289,-0.364282,-0.089435,-0.616081,1.210194,-0.649678,...,0.086248,-0.302164,1.736531,0.830355,0.001252,-0.316335,-0.860239,-0.947312,0.027565,-0.139748
4,-0.092599,-0.554417,-0.608209,-0.534201,-0.574289,-0.595756,-0.250048,-0.658592,-0.439416,-0.541277,...,-0.611658,-0.521336,-0.732396,-0.466475,-0.412965,-0.663192,-0.695692,-0.710033,-0.581939,-0.139748


In [8]:
prediction = reg.predict(test1)
print("correlation with actual rank:", fm.get_spearman(prediction, df_test.label)[0],
      "\nR^2 value for model:", reg.score(test1,df_test.label))

correlation with actual rank: 0.30596270588385877 
R^2 value for model: 0.15553483086371


Persist the model

In [9]:
bucket = 'tech-x-final-project'
MODEL_FOLDER = 'models'

In [10]:
my_region = boto3.session.Session().region_name
print(my_region)
s3 = boto3.client('s3')

us-east-1


In [11]:
key1 = "models/benchmark_model.pkl"
key2 = "models/benchmark_scaler.pkl"
key3 = "models/benchmark_features.pkl"
with tempfile.TemporaryFile() as fp:
    joblib.dump(reg, fp)
    fp.seek(0)
    s3.put_object(Key=key1, Body=fp.read(), Bucket=bucket)
    
with tempfile.TemporaryFile() as fp:
    joblib.dump(scaler, fp)
    fp.seek(0)
    s3.put_object(Key=key2, Body=fp.read(), Bucket=bucket)
    
with tempfile.TemporaryFile() as fp:
    joblib.dump(features, fp)
    fp.seek(0)
    s3.put_object(Key=key3, Body=fp.read(), Bucket=bucket)

Read in KAGGLE test set

In [12]:
DATASET = 'test.csv'
LABELS = 'test_labels.csv'
MAIN_FOLDER = 's3://tech-x-final-project'
DATA_FOLDER = 'raw-data'


test_data = pd.read_csv(os.path.join(MAIN_FOLDER,DATA_FOLDER,DATASET))
test_data = test_data.drop(['data_source'], axis=1)
print(len(test_data))
test_labels = pd.read_csv(os.path.join(MAIN_FOLDER,DATA_FOLDER,LABELS))

# Do this next step in case they aren't in the same order
test = pd.merge(test_data, test_labels, on='seq_id').drop('seq_id',axis=1)

labels = test.tm
test = test.drop('tm',axis=1)

2413


In [13]:
test.nunique()

protein_sequence    2413
pH                     1
dtype: int64

In [14]:
print(labels.nunique())
print(labels.shape)

307
(2413,)


Vectorize Test Data

In [15]:
# Get Count Vectors
test1 = fm.use_features(features, test)
test2 = fm.vector_seq_len(test)
test1['length'] = test2.n_aa
test1['pH'] = test2.pH

In [16]:
# scaler = StandardScaler()
test1 = pd.DataFrame(scaler.fit_transform(test1), columns = test1.columns)

In [22]:
# test1.head()

In [19]:
prediction = reg.predict(test1)
print("correlation with actual rank:", fm.get_spearman(prediction, labels)[0],
      "\nR^2 value for model:", reg.score(test1,labels))

correlation with actual rank: 0.06612291425496875 
R^2 value for model: -10.04231251412316
