In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Load the drug pricing table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; the recorded output of this cell looks like the
# tail of a mixed-dtype DtypeWarning, which this option avoids.
all_drugs = pd.read_csv('./data/all_drugs.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Size of the loaded table as (rows, columns).
all_drugs.shape

(126977, 30)

In [4]:
# List the available column names before choosing model features.
all_drugs.columns

Index(['key_0', 'NDC Description', 'NDC', 'NADAC_Per_Unit', 'Effective_Date',
       'Pricing_Unit', 'Pharmacy_Type_Indicator', 'OTC', 'Explanation_Code',
       'Classification_for_Rate_Setting',
       'Corresponding_Generic_Drug_NADAC_Per_Unit',
       'Corresponding_Generic_Drug_Effective_Date', 'As of Date', 'Unnamed: 0',
       'Ingredient', 'DF;Route', 'Trade_Name', 'Applicant', 'Strength',
       'Appl_Type', 'Appl_No', 'Product_No', 'TE_Code', 'Approval_Date', 'RLD',
       'RS', 'Type', 'Applicant_Full_Name', 'dosage_form', 'route'],
      dtype='object')

In [3]:
# Remove columns that the modelling cells below do not use.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it once the columns are already gone
# is a no-op instead of a KeyError.
all_drugs = all_drugs.drop(
    columns=['Corresponding_Generic_Drug_Effective_Date', 'Unnamed: 0', 'DF;Route'],
    errors='ignore',
)

In [4]:
# Categorical predictors, listed once so that the column selection and the
# one-hot encoding below cannot drift apart.
categorical_cols = [
    'NDC Description',
    'Pharmacy_Type_Indicator',
    'Classification_for_Rate_Setting',
    'dosage_form',
    'route',
    'Ingredient',
    'Type',
]

# Work on a copy so the source frame is left untouched.
features = all_drugs[categorical_cols].copy()

# One-hot encode every predictor; drop_first removes one level per column.
feature_dummies = pd.get_dummies(features, columns=categorical_cols, drop_first=True)

# Design matrix and regression target (price per unit).
X = feature_dummies
y = all_drugs['NADAC_Per_Unit']

In [7]:
# Inspect the one-hot encoded design matrix (pandas truncates the display of large frames).
feature_dummies

Unnamed: 0,NDC Description_ACETAMINOPHEN 325MG TABLET,NDC Description_ACETAMINOPHEN ER 650MG TABLET,NDC Description_ACETAZOLAMIDE 125MG TABLET,NDC Description_ACETAZOLAMIDE 250MG TABLET,NDC Description_ACTIVELLA 1MG-0.5MG TABLET,NDC Description_ACYCLOVIR 400MG TABLET,NDC Description_ACYCLOVIR 800MG TABLET,NDC Description_ADEFOVIR DIPIVOXIL 10MG TABLET ORAL,NDC Description_AFEDITAB CR 30MG TABLET,NDC Description_AFEDITAB CR 60MG TABLET,...,Ingredient_TROSPIUM CHLORIDE,Ingredient_VALPROIC ACID,Ingredient_VALSARTAN,Ingredient_VORICONAZOLE,Ingredient_ZAFIRLUKAST,Ingredient_ZIDOVUDINE,Ingredient_ZOLMITRIPTAN,Ingredient_ZOLPIDEM TARTRATE,Type_OTC,Type_RX
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126972,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
126973,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
126974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
126975,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=17
)

# Baseline model: ordinary least squares on the one-hot features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score both splits (R^2 for scikit-learn regressors) to expose over-fitting.
lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)
print(f'LR Train: {lr_train_score}')
print(f'LR Test: {lr_test_score}')

LR Train: 0.926903481896789
LR Test: -6.147243174148271e+18


In [20]:
# Linear-regression predictions for both splits.
test_preds = lr.predict(X_test)
train_preds = lr.predict(X_train)

# RMSE is computed as the explicit square root of the MSE: the `squared=False`
# shortcut was deprecated in scikit-learn 1.4 and removed in 1.6, whereas this
# form works on every version.
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, test_preds))
print(f'Test RMSE: {test_rmse}')


Test RMSE: 33619121272.395153


In [19]:
# Training RMSE for the linear regression, taken as the explicit square root of
# the MSE because `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, train_preds))
print(f'Train RMSE: {train_rmse}')


Train RMSE: 3.1971220088093455


In [None]:
# LR Train: 0.926903481896789
# LR Test: -6.147243174148271e+18
# Train RMSE: 3.1971220088093455
# Test RMSE: 33619121272.395153

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into the fit.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)


In [12]:
# k-nearest-neighbours regressor using 7 neighbours.
knn = KNeighborsRegressor(n_neighbors = 7)

In [13]:
# Fit on the standardised training features.
knn.fit(X_train_sc, y_train)

# Score both splits (R^2 for scikit-learn regressors) to compare fit and generalisation.
knn_train_score = knn.score(X_train_sc, y_train)
knn_test_score = knn.score(X_test_sc, y_test)
print(f'Train Score {knn_train_score}')
print(f'Test Score {knn_test_score}')

Train Score 0.8915498524974116
Test Score 0.8080358209686297


In [14]:
# KNN predictions on the scaled hold-out set, kept for the RMSE computation.
y_pred = knn.predict(X_test_sc)

In [18]:
# Hold-out RMSE for KNN, taken as the explicit square root of the MSE because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
knn_test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(f'Test RMSE {knn_test_rmse}')

Test RMSE 5.940954948304041


In [8]:
#Train Score 0.8915498524974116
#Test Score 0.8080358209686297
#Test RMSE 5.940954948304041

In [9]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

# Tree ensembles are stochastic: pin the seed (same value as the train/test
# split) so scores are repeatable across runs. n_jobs=-1 builds the trees on all
# available cores; it changes only the run time, not the fitted model.
rf = RandomForestRegressor(random_state=17, n_jobs=-1)
et = ExtraTreesRegressor(random_state=17, n_jobs=-1)

In [10]:
# Mean cross-validated score of the random forest on the scaled training data
# (cross_val_score's default folds and scorer). The recorded run was interrupted,
# so no score was captured.
cross_val_score(rf, X_train_sc, y_train).mean()

KeyboardInterrupt: 

In [None]:
# Mean cross-validated score of the extra-trees model on the scaled training data
# (cross_val_score's default folds and scorer). No output was recorded for this cell.
cross_val_score(et, X_train_sc, y_train).mean()

In [13]:
from sklearn.model_selection import GridSearchCV

# Search grid for the random forest: a few ensemble sizes and shallow depths.
params = {
    'n_estimators': [95, 100, 105],
    # 1.0 means "consider every feature at each split", which is what 'auto'
    # meant for regressors. 'auto' was deprecated in scikit-learn 1.1 and
    # removed in 1.3, while the float form is accepted by old and new versions.
    'max_features': [1.0],
    'max_depth': [3, 4, 5, 6]
}

# 5-fold cross-validated exhaustive search over the grid above.
gs = GridSearchCV(rf, param_grid=params, cv=5)
gs.fit(X_train_sc, y_train)

# Best mean cross-validated score found by the search.
gs.best_score_

KeyboardInterrupt: 

In [14]:
# Best hyper-parameter combination. Only available once gs.fit(...) has run to
# completion; the recorded AttributeError follows an interrupted fit.
gs.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [17]:
# Random forest with hand-picked settings, fitted on the unscaled features.
# max_features=1.0 (use every feature at each split) replaces 'auto', which
# meant the same thing for regressors but was removed in scikit-learn 1.3.
# random_state pins the bootstrap/feature sampling so the scores are repeatable.
rt = RandomForestRegressor(
    max_depth=6,
    max_features=1.0,
    n_estimators=105,
    random_state=17,
)
rt.fit(X_train, y_train)
print(rt.score(X_train, y_train))
print(rt.score(X_test, y_test))
#train is better but test is slightly worse

0.87318737377666
0.7989923850483017


In [19]:
# Hold-out predictions and RMSE for the random forest. The root is taken
# explicitly because `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
y_preds_rt = rt.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_preds_rt)))
#slightly worse than KNN

6.0792837755132805


In [None]:
# Voting ensemble of AdaBoost, gradient boosting and random forest regressors
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor

# Base learners whose predictions the voting regressor combines.
base_estimators = [
    ('ada', AdaBoostRegressor()),
    ('gb', GradientBoostingRegressor()),
    ('forest', RandomForestRegressor()),
]
vote = VotingRegressor(base_estimators)

# Grid keys use the '<estimator name>__<parameter>' convention to reach the
# hyper-parameters of the individual base learners.
params = dict(
    ada__n_estimators=[100, 150, 200],
    gb__n_estimators=[100, 150, 200],
    forest__max_depth=[6, 7, 8],
)

# 5-fold cross-validated exhaustive search over the ensemble's grid.
gs = GridSearchCV(vote, param_grid=params, cv=5)
gs.fit(X_train_sc, y_train)
print(gs.best_score_)
print(gs.best_params_)

In [27]:
#svm


0