# Explore here

In [56]:
# Your code here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from joblib import dump

In [12]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv"
df_raw = pd.read_csv(url)


In [13]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Columns: 108 entries, fips to Urban_rural_code
dtypes: float64(61), int64(45), object(2)
memory usage: 2.6+ MB


In [14]:
df_raw.sample(7, random_state=2024)

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
2190,40123,38247,5275,13.791931,5247,13.718723,5736,14.997255,4855,12.693806,...,2498,11.9,11.1,12.7,3445,3.2,3.0,3.4,925,5
284,8083,26158,3138,11.99633,3147,12.030736,2672,10.214848,2990,11.430537,...,1678,11.4,10.4,12.3,2321,3.6,3.4,3.9,740,6
2514,47177,40878,5029,12.302461,5476,13.395959,4773,11.676207,4915,12.023582,...,4040,15.3,14.1,16.5,4797,3.8,3.5,4.1,1181,5
2476,47101,12086,1346,11.136853,1508,12.477246,1351,11.178223,1315,10.880357,...,1224,15.5,14.3,16.7,1468,3.8,3.5,4.2,364,6
1479,28161,12392,1488,12.007747,1540,12.427373,1486,11.991607,1314,10.603615,...,992,17.1,16.1,18.1,1645,3.9,3.7,4.2,375,6
572,16047,15196,2250,14.806528,2325,15.300079,1800,11.845222,1728,11.371414,...,882,12.7,11.8,13.6,1401,3.4,3.2,3.7,378,6
2956,53011,481857,61427,12.747973,64952,13.479518,60445,12.544178,64304,13.345038,...,21744,9.6,8.8,10.4,35011,2.7,2.5,2.9,9872,2


In [15]:
print(list(df_raw.columns))

['fips', 'TOT_POP', '0-9', '0-9 y/o % of total pop', '19-Oct', '10-19 y/o % of total pop', '20-29', '20-29 y/o % of total pop', '30-39', '30-39 y/o % of total pop', '40-49', '40-49 y/o % of total pop', '50-59', '50-59 y/o % of total pop', '60-69', '60-69 y/o % of total pop', '70-79', '70-79 y/o % of total pop', '80+', '80+ y/o % of total pop', 'White-alone pop', '% White-alone', 'Black-alone pop', '% Black-alone', 'Native American/American Indian-alone pop', '% NA/AI-alone', 'Asian-alone pop', '% Asian-alone', 'Hawaiian/Pacific Islander-alone pop', '% Hawaiian/PI-alone', 'Two or more races pop', '% Two or more races', 'POP_ESTIMATE_2018', 'N_POP_CHG_2018', 'GQ_ESTIMATES_2018', 'R_birth_2018', 'R_death_2018', 'R_NATURAL_INC_2018', 'R_INTERNATIONAL_MIG_2018', 'R_DOMESTIC_MIG_2018', 'R_NET_MIG_2018', 'Less than a high school diploma 2014-18', 'High school diploma only 2014-18', "Some college or associate's degree 2014-18", "Bachelor's degree or higher 2014-18", 'Percent of adults with les

In [16]:
df_interim = (
    df_raw
    .copy()
    .set_axis(
        df_raw.columns.str.replace(' ', '_')
        .str.replace(r'\W','', regex=True)
        .str.lower()
        .str.slice(0,40), axis=1
    )
    .rename(columns={'heart_disease_number':'target'})
)

In [17]:
df = (
    df_interim
    .copy()
    .reindex(
        columns=(
            ['target'] + 
            [c for c in df_interim.columns.to_list() if c not in ['target']]
        )
    )
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Columns: 108 entries, target to urban_rural_code
dtypes: float64(61), int64(45), object(2)
memory usage: 2.6+ MB


In [18]:
df_raw.head(10)

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2
5,1011,10138,1198,11.816926,1130,11.146183,1395,13.76011,1446,14.263168,...,873,20.6,19.4,21.9,1651,4.5,4.2,4.8,360,6
6,1013,19680,2351,11.946138,2461,12.505081,2286,11.615854,2337,11.875,...,1741,18.6,17.5,19.5,2841,4.3,4.1,4.6,662,6
7,1015,114277,13141,11.499252,14637,12.808352,15463,13.531157,13865,12.1328,...,9158,14.7,13.6,15.7,13182,3.6,3.3,3.8,3204,4
8,1017,33615,3869,11.509743,3812,11.340176,4380,13.029897,3734,11.108136,...,2870,17.5,16.6,18.5,4664,4.0,3.8,4.2,1068,5
9,1019,26032,2554,9.811002,2981,11.451291,2852,10.955747,2504,9.618931,...,2540,15.9,14.7,17.2,3353,3.9,3.6,4.2,814,6


In [20]:
df_interim = df_interim.drop_duplicates().reset_index(drop = True)
df_interim.head()

Unnamed: 0,fips,tot_pop,09,09_yo__of_total_pop,19oct,1019_yo__of_total_pop,2029,2029_yo__of_total_pop,3039,3039_yo__of_total_pop,...,copd_number,diabetes_prevalence,diabetes_lower_95_ci,diabetes_upper_95_ci,diabetes_number,ckd_prevalence,ckd_lower_95_ci,ckd_upper_95_ci,ckd_number,urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


In [None]:
data_types = df.dtypes
numeric_col = [i for i in list(data_types[data_types != "object"].index) if i != "target"]

scaler = StandardScaler()
norm_features = scaler.fit_transform(df[numeric_col])

# Crear un nuevo DataFrame con las variables numéricas escaladas
df_scal = pd.DataFrame(norm_features, index=df.index, columns=numeric_col)
df_scal["target"] = df["target"]
df_scal.head()



Unnamed: 0,fips,tot_pop,09,09_yo__of_total_pop,19oct,1019_yo__of_total_pop,2029,2029_yo__of_total_pop,3039,3039_yo__of_total_pop,...,diabetes_prevalence,diabetes_lower_95_ci,diabetes_upper_95_ci,diabetes_number,ckd_prevalence,ckd_lower_95_ci,ckd_upper_95_ci,ckd_number,urban_rural_code,target
0,-1.940874,-0.145679,-0.142421,0.158006,-0.135556,0.573496,-0.153144,0.02761,-0.139384,0.588469,...,-0.063696,-0.07172,-0.089834,-0.129902,-0.609615,-0.582796,-0.669652,-0.147523,-1.082865,3345
1,-1.940742,0.341296,0.287476,-0.242861,0.320383,-0.193107,0.183774,-0.469965,0.23062,-0.1103,...,-0.394103,-0.4149,-0.337677,0.376251,-0.433549,-0.393279,-0.343373,0.389791,-0.420704,13414
2,-1.94061,-0.237785,-0.239429,-0.419441,-0.246181,-0.439718,-0.225971,0.272104,-0.218759,0.656538,...,2.432709,2.483064,2.317776,-0.183415,1.855312,1.880929,1.777443,-0.204321,0.903618,2159
3,-1.940478,-0.245223,-0.246032,-0.426966,-0.254791,-0.609076,-0.230792,0.396168,-0.220555,1.264959,...,0.376846,0.423984,0.299632,-0.229096,-0.257483,-0.203761,-0.180233,-0.2421,-1.745026,1533
4,-1.940346,-0.138966,-0.135053,0.186249,-0.13714,0.216679,-0.155888,-0.200808,-0.14357,0.088582,...,0.156575,0.195197,0.158008,-0.111247,-0.081417,-0.014244,-0.017093,-0.124105,-1.745026,4101


In [28]:
X = df_scal.drop(columns=["target"])
y = df_scal["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
train_indices = list(X_train.index)
test_indices = list(X_test.index)

k = int(len(X_train.columns) * 0.3)
selection_model = SelectKBest(score_func = f_regression, k = k)
selection_model.fit(X_train, y_train)
ix = selection_model.get_support()

X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns = X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns = X_test.columns.values[ix])

X_train_sel.head()

Unnamed: 0,tot_pop,09,19oct,2029,3039,4049,5059,6069,7079,80,...,family_medicinegeneral_practice_primary_,total_specialist_physicians_2019,total_population,population_aged_60,county_pop2018_18_and_older,anycondition_number,obesity_number,copd_number,diabetes_number,ckd_number
0,-0.232556,-0.227731,-0.234284,-0.232951,-0.226353,-0.231316,-0.229599,-0.233425,-0.23468,-0.23442,...,-0.212643,-0.20859,-0.231195,-0.229737,-0.233171,-0.23437,-0.232975,-0.223516,-0.218609,-0.219329
1,-0.158676,-0.178665,-0.180166,-0.188266,-0.17507,-0.161168,-0.134688,-0.105618,-0.11927,-0.091822,...,-0.11668,-0.11085,-0.150293,-0.098866,-0.152859,-0.142645,-0.155304,-0.11008,-0.131449,-0.130962
2,-0.199114,-0.211128,-0.195138,-0.166782,-0.195036,-0.194045,-0.199725,-0.219256,-0.222207,-0.205154,...,-0.192263,-0.217668,-0.197005,-0.216056,-0.195125,-0.193205,-0.201976,-0.193106,-0.189197,-0.206391
3,-0.036595,-0.037734,-0.017077,-0.057986,-0.052252,-0.033158,-0.020228,-0.032603,-0.023876,-0.046224,...,0.062458,-0.107888,-0.03694,-0.030034,-0.039882,-0.003321,0.006163,-0.007077,-0.047515,-0.045054
4,0.090839,0.09468,0.101662,0.056721,0.042392,0.068095,0.101699,0.144664,0.140685,0.166099,...,0.274818,0.194913,0.097767,0.161314,0.088485,0.165555,0.18274,0.265603,0.12304,0.132454


In [30]:
X_test_sel.sample(7, random_state=2024)

Unnamed: 0,tot_pop,09,19oct,2029,3039,4049,5059,6069,7079,80,...,family_medicinegeneral_practice_primary_,total_specialist_physicians_2019,total_population,population_aged_60,county_pop2018_18_and_older,anycondition_number,obesity_number,copd_number,diabetes_number,ckd_number
27,-0.208318,-0.169526,-0.1879,-0.218123,-0.211246,-0.211323,-0.212414,-0.224848,-0.239215,-0.200594,...,-0.186271,-0.20706,-0.206233,-0.220893,-0.217924,-0.22997,-0.21714,-0.236304,-0.222037,-0.212472
140,-0.231464,-0.220674,-0.236413,-0.22825,-0.218248,-0.223885,-0.230418,-0.250346,-0.239314,-0.241135,...,-0.246896,-0.236373,-0.229581,-0.245406,-0.232633,-0.228231,-0.231699,-0.205003,-0.207953,-0.222951
55,-0.280797,-0.270027,-0.283719,-0.262202,-0.258221,-0.270428,-0.287495,-0.307658,-0.319968,-0.297097,...,-0.303945,-0.286226,-0.278022,-0.311583,-0.281837,-0.29107,-0.290795,-0.315894,-0.262206,-0.272763
1,0.496553,0.433072,0.39217,0.544659,0.453677,0.39148,0.499744,0.668639,0.716353,0.476084,...,0.853184,0.424904,0.477184,0.620724,0.517408,0.52736,0.516364,0.443806,0.418504,0.454092
428,-0.28024,-0.268089,-0.278917,-0.266217,-0.25718,-0.267207,-0.287121,-0.311453,-0.316665,-0.300579,...,-0.306895,-0.284188,-0.278586,-0.310492,-0.282282,-0.296401,-0.298142,-0.314876,-0.269432,-0.277161
86,2.701274,2.899948,3.268346,2.088017,2.719415,3.434899,2.788722,2.191626,2.131735,1.455369,...,2.191753,1.87823,2.472618,1.856779,2.568326,2.261611,2.330951,1.80358,1.982948,1.891371
402,-0.112159,-0.0907,-0.121435,-0.075476,-0.104413,-0.140439,-0.13827,-0.126248,-0.107734,-0.089086,...,-0.061665,-0.138321,-0.112984,-0.113411,-0.114457,-0.11299,-0.124802,-0.125922,-0.11044,-0.089432


In [31]:
X_train_sel["Heart disease_number"] = list(y_train)
X_test_sel["Heart disease_number"] = list(y_test)

X_train_sel.to_csv("../data/processed/clean_train.csv", index = False)
X_test_sel.to_csv("../data/processed/clean_test.csv", index = False)

In [32]:
total_data = pd.concat([X_train_sel, X_test_sel])
total_data.head()

Unnamed: 0,tot_pop,09,19oct,2029,3039,4049,5059,6069,7079,80,...,total_specialist_physicians_2019,total_population,population_aged_60,county_pop2018_18_and_older,anycondition_number,obesity_number,copd_number,diabetes_number,ckd_number,Heart disease_number
0,-0.232556,-0.227731,-0.234284,-0.232951,-0.226353,-0.231316,-0.229599,-0.233425,-0.23468,-0.23442,...,-0.20859,-0.231195,-0.229737,-0.233171,-0.23437,-0.232975,-0.223516,-0.218609,-0.219329,2072
1,-0.158676,-0.178665,-0.180166,-0.188266,-0.17507,-0.161168,-0.134688,-0.105618,-0.11927,-0.091822,...,-0.11085,-0.150293,-0.098866,-0.152859,-0.142645,-0.155304,-0.11008,-0.131449,-0.130962,3796
2,-0.199114,-0.211128,-0.195138,-0.166782,-0.195036,-0.194045,-0.199725,-0.219256,-0.222207,-0.205154,...,-0.217668,-0.197005,-0.216056,-0.195125,-0.193205,-0.201976,-0.193106,-0.189197,-0.206391,2222
3,-0.036595,-0.037734,-0.017077,-0.057986,-0.052252,-0.033158,-0.020228,-0.032603,-0.023876,-0.046224,...,-0.107888,-0.03694,-0.030034,-0.039882,-0.003321,0.006163,-0.007077,-0.047515,-0.045054,5484
4,0.090839,0.09468,0.101662,0.056721,0.042392,0.068095,0.101699,0.144664,0.140685,0.166099,...,0.194913,0.097767,0.161314,0.088485,0.165555,0.18274,0.265603,0.12304,0.132454,8686


In [33]:
X_test_sel.head()

Unnamed: 0,tot_pop,09,19oct,2029,3039,4049,5059,6069,7079,80,...,total_specialist_physicians_2019,total_population,population_aged_60,county_pop2018_18_and_older,anycondition_number,obesity_number,copd_number,diabetes_number,ckd_number,Heart disease_number
0,-0.285286,-0.285362,-0.294836,-0.269566,-0.258568,-0.268541,-0.289649,-0.312989,-0.316763,-0.286734,...,-0.285225,-0.284324,-0.308211,-0.283698,-0.302439,-0.302292,-0.324038,-0.27629,-0.281172,698
1,0.496553,0.433072,0.39217,0.544659,0.453677,0.39148,0.499744,0.668639,0.716353,0.476084,...,0.424904,0.477184,0.620724,0.517408,0.52736,0.516364,0.443806,0.418504,0.454092,13982
2,-0.260191,-0.255123,-0.265837,-0.246628,-0.234723,-0.240703,-0.264552,-0.289867,-0.289846,-0.290962,...,-0.261868,-0.257294,-0.287868,-0.259943,-0.249299,-0.259877,-0.225107,-0.22597,-0.242229,1768
3,0.039389,0.058341,0.059701,-0.018647,0.003236,0.030594,0.074401,0.091003,0.060721,0.005012,...,0.130719,0.036299,0.055281,0.031494,0.035274,0.026108,0.136643,0.003409,0.022352,6739
4,0.364272,0.281232,0.323623,0.525353,0.29599,0.288317,0.298029,0.461297,0.49776,0.351393,...,0.305024,0.336581,0.423969,0.390596,0.271127,0.273318,0.329669,0.25662,0.334804,11305


In [34]:
X_train_sel["target"] = list(y_train)
X_test_sel["target"] = list(y_test)

X_train_sel.to_csv("../data/processed/clean_train.csv", index = False)
X_test_sel.to_csv("../data/processed/clean_test.csv", index = False)

In [35]:
total_data = pd.concat([X_train_sel, X_test_sel])
total_data.head()

Unnamed: 0,tot_pop,09,19oct,2029,3039,4049,5059,6069,7079,80,...,total_population,population_aged_60,county_pop2018_18_and_older,anycondition_number,obesity_number,copd_number,diabetes_number,ckd_number,Heart disease_number,target
0,-0.232556,-0.227731,-0.234284,-0.232951,-0.226353,-0.231316,-0.229599,-0.233425,-0.23468,-0.23442,...,-0.231195,-0.229737,-0.233171,-0.23437,-0.232975,-0.223516,-0.218609,-0.219329,2072,2072
1,-0.158676,-0.178665,-0.180166,-0.188266,-0.17507,-0.161168,-0.134688,-0.105618,-0.11927,-0.091822,...,-0.150293,-0.098866,-0.152859,-0.142645,-0.155304,-0.11008,-0.131449,-0.130962,3796,3796
2,-0.199114,-0.211128,-0.195138,-0.166782,-0.195036,-0.194045,-0.199725,-0.219256,-0.222207,-0.205154,...,-0.197005,-0.216056,-0.195125,-0.193205,-0.201976,-0.193106,-0.189197,-0.206391,2222,2222
3,-0.036595,-0.037734,-0.017077,-0.057986,-0.052252,-0.033158,-0.020228,-0.032603,-0.023876,-0.046224,...,-0.03694,-0.030034,-0.039882,-0.003321,0.006163,-0.007077,-0.047515,-0.045054,5484,5484
4,0.090839,0.09468,0.101662,0.056721,0.042392,0.068095,0.101699,0.144664,0.140685,0.166099,...,0.097767,0.161314,0.088485,0.165555,0.18274,0.265603,0.12304,0.132454,8686,8686


In [36]:
X_test_sel.head()

Unnamed: 0,tot_pop,09,19oct,2029,3039,4049,5059,6069,7079,80,...,total_population,population_aged_60,county_pop2018_18_and_older,anycondition_number,obesity_number,copd_number,diabetes_number,ckd_number,Heart disease_number,target
0,-0.285286,-0.285362,-0.294836,-0.269566,-0.258568,-0.268541,-0.289649,-0.312989,-0.316763,-0.286734,...,-0.284324,-0.308211,-0.283698,-0.302439,-0.302292,-0.324038,-0.27629,-0.281172,698,698
1,0.496553,0.433072,0.39217,0.544659,0.453677,0.39148,0.499744,0.668639,0.716353,0.476084,...,0.477184,0.620724,0.517408,0.52736,0.516364,0.443806,0.418504,0.454092,13982,13982
2,-0.260191,-0.255123,-0.265837,-0.246628,-0.234723,-0.240703,-0.264552,-0.289867,-0.289846,-0.290962,...,-0.257294,-0.287868,-0.259943,-0.249299,-0.259877,-0.225107,-0.22597,-0.242229,1768,1768
3,0.039389,0.058341,0.059701,-0.018647,0.003236,0.030594,0.074401,0.091003,0.060721,0.005012,...,0.036299,0.055281,0.031494,0.035274,0.026108,0.136643,0.003409,0.022352,6739,6739
4,0.364272,0.281232,0.323623,0.525353,0.29599,0.288317,0.298029,0.461297,0.49776,0.351393,...,0.336581,0.423969,0.390596,0.271127,0.273318,0.329669,0.25662,0.334804,11305,11305


In [37]:
train_data = pd.read_csv("../data/processed/clean_train.csv")
test_data = pd.read_csv("../data/processed/clean_test.csv")

train_data.head()

Unnamed: 0,tot_pop,09,19oct,2029,3039,4049,5059,6069,7079,80,...,total_population,population_aged_60,county_pop2018_18_and_older,anycondition_number,obesity_number,copd_number,diabetes_number,ckd_number,Heart disease_number,target
0,-0.232556,-0.227731,-0.234284,-0.232951,-0.226353,-0.231316,-0.229599,-0.233425,-0.23468,-0.23442,...,-0.231195,-0.229737,-0.233171,-0.23437,-0.232975,-0.223516,-0.218609,-0.219329,2072,2072
1,-0.158676,-0.178665,-0.180166,-0.188266,-0.17507,-0.161168,-0.134688,-0.105618,-0.11927,-0.091822,...,-0.150293,-0.098866,-0.152859,-0.142645,-0.155304,-0.11008,-0.131449,-0.130962,3796,3796
2,-0.199114,-0.211128,-0.195138,-0.166782,-0.195036,-0.194045,-0.199725,-0.219256,-0.222207,-0.205154,...,-0.197005,-0.216056,-0.195125,-0.193205,-0.201976,-0.193106,-0.189197,-0.206391,2222,2222
3,-0.036595,-0.037734,-0.017077,-0.057986,-0.052252,-0.033158,-0.020228,-0.032603,-0.023876,-0.046224,...,-0.03694,-0.030034,-0.039882,-0.003321,0.006163,-0.007077,-0.047515,-0.045054,5484,5484
4,0.090839,0.09468,0.101662,0.056721,0.042392,0.068095,0.101699,0.144664,0.140685,0.166099,...,0.097767,0.161314,0.088485,0.165555,0.18274,0.265603,0.12304,0.132454,8686,8686


In [38]:
X_train = train_data.drop(["target"], axis = 1)
y_train = train_data["target"]
X_test = test_data.drop(["target"], axis = 1)
y_test = test_data["target"]

In [41]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
print(f"Intercep (a): {model.intercept_}")
print(f"Coefficients: {model.coef_}")

Intercep (a): [ 1.14288622e-08  1.14154474e-08  1.13841269e-08 ... -3.74590867e-07
 -3.94861370e-07 -5.14809429e-07]
Coefficients: [[-1.69290193e-08 -1.63328637e-08 -1.72510791e-08 ... -1.76199387e-08
  -1.79115894e-08 -2.66039236e-04]
 [-1.69505535e-08 -1.64115961e-08 -1.73289374e-08 ... -1.76105367e-08
  -1.78792400e-08 -2.65734288e-04]
 [-1.68794070e-08 -1.63023376e-08 -1.72209453e-08 ... -1.75591005e-08
  -1.78450460e-08 -2.65022863e-04]
 ...
 [ 1.13716855e-07  1.69863467e-07  2.43162986e-07 ... -1.77594466e-07
   1.42690431e-07  2.63140765e-04]
 [ 1.84403929e-07  1.33400434e-07  1.05720881e-07 ...  7.79342749e-08
   2.16363758e-07  2.63519302e-04]
 [ 4.75996134e-07  3.40194814e-07  3.92010957e-07 ...  6.43601204e-07
   4.88283134e-07  2.66494072e-04]]


In [48]:
reg_lasso = Lasso(alpha=5)
reg_lasso.fit(X_train,y_train)
y_pred = reg_lasso.predict(X_test) 

  model = cd_fast.enet_coordinate_descent(


In [43]:
y_pred = model.predict(X_test)
y_pred

array([434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
       434790, 434790, 434790, 434790, 434790, 434790, 434790, 434790,
      

In [49]:
reg_mlin = LinearRegression()
reg_mlin.fit(X_train, y_train)
y_pred_m = reg_mlin.predict(X_test)

In [52]:
print(f'MSE: {mean_squared_error(y_test, y_pred_m)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_m))}')

print(f'MAE: {mean_absolute_error(y_test, y_pred_m)}')
print(f'R2: {r2_score(y_test, y_pred_m)}')

MSE: 3.2691912133166113e-24
RMSE: 1.8080904881439455e-12
MAE: 1.0809753658363052e-12
R2: 1.0


In [54]:
alpha = 1.0
lasso_model = Lasso(alpha = alpha)

lasso_model.fit(X_train, y_train)

score = lasso_model.score(X_test, y_test)
print("Coefficients:", lasso_model.coef_)
print("R2 score:", score)

Coefficients: [ 3.33577112e+02  8.45394692e+02 -6.89491582e+02 -1.52429254e+02
 -2.14957833e+02  1.52698904e+03  7.97261114e+01 -2.02909224e+02
  1.25048835e+03  6.02928906e+02  6.37066624e+02 -1.91767297e+03
 -1.72879192e+02 -2.78088841e+02  0.00000000e+00  7.07820368e+02
  2.39844331e+02  2.01289921e+01 -7.78739755e+02  0.00000000e+00
  2.03658247e+02 -1.63845819e+02 -2.10174123e+02 -1.61255937e+03
  9.64632173e+02  4.89045493e+02  2.49974345e+02  1.50987946e+03
  2.82186155e+03  1.08781768e+03 -3.54680957e+02  5.67633698e-01]
R2 score: 0.9996525087522968


  model = cd_fast.enet_coordinate_descent(


In [57]:
dump(lasso_model, open("../models/lasso_alpha-1.0.sav", "wb"))