In [1]:
import django
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "kML.settings")
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
django.setup()

In [2]:
from regml.models import RegData, ColumnTypes, DataOutput, FileMetaData


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, f_regression
from pandas.api.types import is_numeric_dtype
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as opy
import plotly.express as px
import math
import networkx as nx
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

In [23]:
title = 'boston project2'

In [27]:
data_dict = {}
x_cols = []
y_cols = []
col_types = {
            'n': [],
            'c': [],
            'd': [],
            'int': []
}

In [28]:
# Retrieve columns
for row in ColumnTypes.objects.all().filter(project_name=title):
    if row.y:
        y_cols.append(row.col_name)
        col_types[row.col_type].append(row.col_name)
    else:
        x_cols.append(row.col_name)
        col_types[row.col_type].append(row.col_name)

In [29]:
# Retrieve observations
for i, row in enumerate(RegData.objects.all().filter(project_name=title)):
    data_dict[i] = row.observations

In [30]:
# Build DF
df = pd.DataFrame.from_dict(data_dict, orient='index')
for col in col_types['n']:
    if not is_numeric_dtype(df[col]):
        try:
            df[col] = pd.to_numeric(df[col])
        except ValueError as e:
            col_types['n'].remove(col)
            col_types['c'].append(col)

for col in col_types['c']:
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError as e:
        pass
    else:
        col_types['int'].append(col)
col_types['int'].extend(col_types['n'])

In [33]:
# Save Corr Matrix
corr = df[col_types['int']].corr().reset_index()
# checking if exists
existing = DataOutput.objects.filter(project_name=title, output_name='corr_matrix').exists()
if existing:
    DataOutput.objects.filter(project_name=title, output_name='corr_matrix').delete()
# saving corr matrix to plot in java script
DataOutput(output=pd.melt(corr, id_vars='index').to_dict(orient='records'), output_name='corr_matrix',
                   project_name=FileMetaData.objects.get(project_name=title)).save()

In [115]:
corr_new = corr.set_index('index')
fig = go.Figure(data=go.Heatmap(
                    x=corr_new.columns,
                    y=corr_new.index,
                    z=corr_new,
colorscale=[(0, "#ff9900"), (0.5, 'white'), (1, "#2D3949")]))
fig.update_layout(showlegend=False, title_text=f"Feature Correlation Matrix",
                          template="presentation")
fig.update_yaxes(title=None)
fig.update_xaxes(tickangle=45)
fig.update_layout(
    font_family="Gravitas One",
    font_color="#2D3949",
)

fig.show()

In [36]:
#var comes from col_types['int']
var = 'zn'
extract_df = df[[var, y_cols[0]]]
normalized_df = (extract_df - extract_df.mean()) / extract_df.std()
if normalized_df.shape[0] > 1000:
    normalized_df = normalized_df.sample(1000)
target = y_cols[0]    
fig = px.scatter(normalized_df, x=var, y=target)
fig.update_layout(showlegend=False, title_text=f"Scatter plot of {var} and the target {target}",
                          template="presentation")
fig.update_traces(marker=dict(size=5,
                              line=dict(width=2,
                                        color='#2D3949')),
                  selector=dict(mode='markers'))
fig.update_layout(
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.show()

In [88]:
# all plots
normalized_df = (df[col_types['int']] -
                        df[col_types['int']].mean()) / df[col_types['int']].std()
if normalized_df.shape[0] > 500:
    normalized_df = normalized_df.sample(500)
x = y_cols[0]
temp_cols = list(normalized_df.columns)
temp_cols.remove(x)
len(temp_cols)
    
def multi_plot(df, columns, y, addAll = False):
    fig = go.Figure()

    for column in columns:
        fig.add_trace(
            go.Scatter(
                x = df[column],
                y = df[y],
                name = column,
                mode='markers'
            )
        )


    def create_layout_button(column):
        return dict(label = column,
                    method = 'update',
                    args = [{'visible': df.columns.isin([column]),
                             'title': column,
                             'showlegend': False}])

    fig.update_layout(
        updatemenus=[go.layout.Updatemenu(
            active = 0,
            buttons = (list(df.columns.map(lambda column: create_layout_button(column)))
            )
        )])
    
    fig.show()

multi_plot(normalized_df, temp_cols, 'medv')

In [38]:
# f-scores
label_encoder = LabelEncoder()
for col in col_types['c']:
    df[col] = label_encoder.fit_transform(df[col])
new_l = col_types['n'][:]
new_l.extend(col_types['c'])
y = y_cols[0]
new_l.remove(y)
X = df[new_l]
y = df[y]
f_scores = f_regression(X, y, center=True)
p_values = pd.Series(f_scores[1], index=X.columns) \
        .sort_values(ascending=False)

fig = go.Figure([go.Bar(x=p_values.index, y=p_values.values)])
fig.update_traces(marker_color="#ff9900", marker_line_color='#2D3949',
                          marker_line_width=1.5, opacity=0.8)
fig.update_layout(showlegend=False, title_text=f"F-scores - Categorical and Numeric Features",
                          template="presentation")
fig.update_xaxes(tickangle=45)
fig.update_layout(
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.show()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, f_regression
from pandas.api.types import is_numeric_dtype
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as opy
import plotly.express as px
import math
import networkx as nx
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

In [None]:
title = 'boston project'

In [None]:
data_dict = {}
x_cols = []
y_cols = []
col_types = {
            'n': [],
            'c': [],
            'd': [],
            'int': []
}

In [None]:
# Retrieve columns
for row in ColumnTypes.objects.all().filter(project_name=title):
    if row.y:
        y_cols.append(row.col_name)
        col_types[row.col_type].append(row.col_name)
    else:
        x_cols.append(row.col_name)
        col_types[row.col_type].append(row.col_name)

In [None]:
# Retrieve observations
for i, row in enumerate(RegData.objects.all().filter(project_name=title)):
    data_dict[i] = row.observations

In [None]:
# Build DF
df = pd.DataFrame.from_dict(data_dict, orient='index')
for col in col_types['n']:
    if not is_numeric_dtype(df[col]):
        try:
            df[col] = pd.to_numeric(df[col])
        except ValueError as e:
            col_types['n'].remove(col)
            col_types['c'].append(col)

for col in col_types['c']:
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError as e:
        pass
    else:
        col_types['int'].append(col)
col_types['int'].extend(col_types['n'])

In [None]:
df.head()

In [None]:
# Save Corr Matrix
corr = df[col_types['int']].corr().reset_index()
# checking if exists
existing = DataOutput.objects.filter(project_name=title, output_name='corr_matrix').exists()
if existing:
    DataOutput.objects.filter(project_name=title, output_name='corr_matrix').delete()
# saving corr matrix to plot in java script
DataOutput(output=pd.melt(corr, id_vars='index').to_dict(orient='records'), output_name='corr_matrix',
                   project_name=FileMetaData.objects.get(project_name=title)).save()

In [None]:
fig = px.imshow(corr.set_index('index'), 
               color_continuous_scale=[(0, "#ff9900"), (0.5, 'white'), (1, "#2D3949")],
               )
fig.update_layout(showlegend=False, title_text=f"Feature Correlation Matrix",
                          template="presentation")
fig.update_yaxes(title=None)
fig.update_xaxes(tickangle=45)
fig.update_layout(
    font_family="Gravitas One",
    font_color="#2D3949",
)

fig.show()

In [None]:
#var comes from col_types['int']
var = 'zn'
extract_df = df[[var, y_cols[0]]]
normalized_df = (extract_df - extract_df.mean()) / extract_df.std()
if normalized_df.shape[0] > 1000:
    normalized_df = normalized_df.sample(1000)
target = y_cols[0]    
fig = px.scatter(normalized_df, x=var, y=target)
fig.update_layout(showlegend=False, title_text=f"Scatter plot of {var} and the target {target}",
                          template="presentation")
fig.update_traces(marker=dict(size=5,
                              line=dict(width=2,
                                        color='#2D3949')),
                  selector=dict(mode='markers'))
fig.update_layout(
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.show()

In [None]:
# all plots
normalized_df = (df[col_types['int']] -
                        df[col_types['int']].mean()) / df[col_types['int']].std()
if normalized_df.shape[0] > 500:
    normalized_df = normalized_df.sample(500)
x = y_cols[0]
temp_cols = list(normalized_df.columns)
temp_cols.remove(x)
len(temp_cols)

fig = make_subplots(rows=math.ceil(len(temp_cols) / 2), cols=2, start_cell="bottom-left",
                            subplot_titles=tuple(temp_cols))
rows = 0
for i, y in enumerate(temp_cols):
    if (i + 1) % 2 == 0:
        cols = 2
    else:
        cols = 1
        rows += 1
    fig.add_trace(go.Scatter(x=normalized_df[x], y=normalized_df[y], mode='markers'), row=rows, col=cols)
fig.update_layout(showlegend=False, title_text=f"Linear Relationship of {x} (x axis) and features (y axis)",
                          template="ygridoff")
fig.show()

In [None]:
# f-scores
label_encoder = LabelEncoder()
for col in col_types['c']:
    df[col] = label_encoder.fit_transform(df[col])
new_l = col_types['n'][:]
new_l.extend(col_types['c'])
y = y_cols[0]
new_l.remove(y)
X = df[new_l]
y = df[y]
f_scores = f_regression(X, y, center=True)
p_values = pd.Series(f_scores[1], index=X.columns) \
        .sort_values(ascending=False)

fig = go.Figure([go.Bar(x=p_values.index, y=p_values.values)])
fig.update_traces(marker_color="#ff9900", marker_line_color='#2D3949',
                          marker_line_width=1.5, opacity=0.8)
fig.update_layout(showlegend=False, title_text=f"F-scores - Categorical and Numeric Features",
                          template="presentation")
fig.update_xaxes(tickangle=45)
fig.update_layout(
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.show()

In [76]:
cols_to_keep = ['b', 'nox', 'chas', 'ptratio', 'medv']
y = y_cols[0]
df.head()

Unnamed: 0,b,rm,zn,age,dis,nox,rad,tax,chas,crim,medv,indus,lstat,ptratio
0,396.9,6.03,0.0,80.8,2.505,0.573,1,273,0,0.04741,11.9,11.93,7.88,21.0
1,393.45,6.794,0.0,89.3,2.3889,0.573,1,273,0,0.10959,22.0,11.93,6.48,21.0
2,396.9,6.976,0.0,91.0,2.1675,0.573,1,273,0,0.06076,23.9,11.93,5.64,21.0
3,396.9,6.12,0.0,76.7,2.2875,0.573,1,273,0,0.04527,20.6,11.93,9.08,21.0
4,391.99,6.593,0.0,69.1,2.4786,0.573,1,273,0,0.06263,22.4,11.93,9.67,21.0


In [None]:
# ML

In [77]:
class RegModel:

    def __init__(self):
        self.categorical_cols = None
        self.numeric_cols = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def split_train_test(self, df, y_col, col_to_keep, numeric_features, categorical_features):
        df = df[col_to_keep]
        X, y = df.drop(y_col, axis=1), df[y_col].astype(int)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, random_state=42, test_size=0.2)
        self.numeric_cols = list(set(numeric_features).intersection(df.columns))
        self.categorical_cols = list(set(categorical_features).intersection(df.columns))

    def transform_cols(self):
        numeric_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')
        preprocessor = make_column_transformer(
            (numeric_transformer, self.numeric_cols),
            (categorical_transformer, self.categorical_cols),
            remainder='passthrough')
        return preprocessor, numeric_transformer

    def train_pipeline(self, preprocessor, numeric_transformer):
        rf_pipeline = make_pipeline(preprocessor,
                                    RandomForestRegressor(random_state=42, n_estimators=50))
        gradient_pipeline = make_pipeline(
            preprocessor,
            HistGradientBoostingRegressor(random_state=0))
        regressor = make_pipeline(preprocessor,
                                  LinearRegression())
        ridge_reg = RidgeCV([1e-3, 1e-2, 1e-1, 1])
        poly_reg = PolynomialFeatures(degree=2, include_bias=False)
        poly_pipeline = Pipeline([
            ("poly_features", poly_reg),
            ("std_scaler", numeric_transformer),
            ('regul_reg', ridge_reg)])
        return rf_pipeline, gradient_pipeline, regressor, poly_pipeline

    def plot_model_performance(self):
        pass

    def run(self):
        self.X_self.split_train_test()
        preprocessor, numeric_transformer = self.transform_cols()
        self.train_pipeline(preprocessor, numeric_transformer)

In [78]:
reg_cl = RegModel()
reg_cl.split_train_test(df, y, cols_to_keep, col_types['n'], col_types['c'])

Unnamed: 0,b,nox,chas,ptratio
173,393.68,0.472,0,18.4
274,349.48,0.614,0,20.2
491,380.02,0.538,0,21.0
72,88.27,0.693,0,20.2
452,396.90,0.439,0,16.8
...,...,...,...,...
412,396.90,0.449,0,18.5
436,396.42,0.544,0,18.4
411,392.30,0.449,0,18.5
86,396.90,0.693,0,20.2


In [141]:
reg_cl.X_test

Unnamed: 0,b,nox,chas,ptratio
173,393.68,0.472,0,18.4
274,349.48,0.614,0,20.2
491,380.02,0.538,0,21.0
72,88.27,0.693,0,20.2
452,396.90,0.439,0,16.8
...,...,...,...,...
412,396.90,0.449,0,18.5
436,396.42,0.544,0,18.4
411,392.30,0.449,0,18.5
86,396.90,0.693,0,20.2


In [142]:
reg_cl.y_test

173    36
274    12
491    18
72      7
452    23
       ..
412    23
436    16
411    26
86      5
75     17
Name: medv, Length: 102, dtype: int64