<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/gradient-boosting/notebooks/Housing_prices_kaggle_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
#!pip install catboost

In [47]:
import ftplib
import io
import itertools
import numpy as np
import pandas as pd
import seaborn as sns

#from catboost import CatBoostRegressor
from datetime import datetime
from google.colab import files
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
from random import shuffle
from scipy import stats
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor
from sklearn.linear_model import LassoLarsCV, LinearRegression, SGDRegressor, \
LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder,\
Normalizer
from sklearn.tree import DecisionTreeRegressor


In [48]:
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 80)

In [49]:
raw_df_train = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/train.csv', index_col=0)
raw_df_test = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/test.csv', index_col=0)


In [50]:
def is_cat(df, col):
  '''checks if a column is of object or category type'''
  return df[col].dtype in ['object', 'category']

In [51]:
def improve_cats(dataframe) -> pd.DataFrame:
  '''turns dtypes 64->32 and object->category'''
  df = dataframe.copy()
  for col in df.columns:
    if df[col].dtype == 'int64':
      df[col] = df[col].astype('int32')
    elif df[col].dtype == 'float64':
      df[col] = df[col].astype('float32')
    elif df[col].dtype == 'object':
      df[col] = df[col].astype('category')
    else:
      print('Unknown data type')
      return
  return df

In [52]:
#Change after we get encoding
def get_valid_cols(df, ok_cols, to_drop=[]):
  '''selects all coumns that are not in handpicked list, shouldn't be dropped and not categorical''' 
  return [x for x in df.columns if x not in ok_cols+to_drop and not is_cat(df, x)] #Change when cats are encoded

In [53]:
def split_data(df, target):
  '''splits data'''
  X = df.drop(target, axis=1)
  y = df[target]
  return train_test_split(X, y, test_size=0.2, random_state=13)
  

In [120]:
#Could run separate times for y_pred_train, y_pred_test but training two could take longer 
def run_model(X_train, y_train, X_test, model = LinearRegression):
  '''creates a pipeline for a model and predicts the results'''
  
  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=11,eps=0.01)), ('Tree',DecisionTreeRegressor(max_depth=5)),
                ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.04))]#, ('KNN', KNeighborsRegressor))]
                #'Bag',BaggingRegressor(estimator=LinearRegression(), n_estimators=5, random_state=42)),

  pipe = make_pipeline(StandardScaler(), LinearRegression())#StackingRegressor(estimators=estimators, final_estimator=  RandomForestRegressor(max_depth = 6))) #RandomForestRegressor(max_depth=7)))
                       #KNeighborsRegressor(n_neighbors = 10))#BaggingRegressor(estimator=LinearRegression(), n_estimators=5, random_state=42))#LogisticRegressionCV())#( max_depth=10))
  pipe.fit(X_train, y_train)

  y_pred_train = pipe.predict(X_train)
  y_pred_test = pipe.predict(X_test)
  return y_pred_train, y_pred_test, pipe

In [121]:
def split_run_test(df, target = 'SalePrice'):
  '''splits data and runs model'''
  X_train, X_test, y_train, y_test = split_data(df, target)
  y_pred_train, y_pred_test, model = run_model(X_train, y_train, X_test)

  text = print_scores(y_train, y_test, y_pred_train, y_pred_test, len(X_train.columns), print_train=True)
  return model, text

In [122]:
def adjusted_r2(yt,yp, colnum):
  #computes adjusted r2 score
   return 1 - (1 - r2_score(yt, yp)) * ((yt.shape[0]-1) / (yt.shape[0] - colnum))

In [123]:
def print_scores(y_train, y_test, y_pred_train, y_pred_test, colnum, print_train=False):
  '''outprints adjusted r2 and rmse results on train and validation'''
  train_r2 =  adjusted_r2(y_train, y_pred_train, colnum)
  test_r2 = adjusted_r2(y_test,y_pred_test, colnum)
  train_rmse = np.sqrt(mean_squared_error(np.log(y_pred_train), np.log(y_train)))
  test_rmse = np.sqrt(mean_squared_error(np.log(y_pred_test), np.log(y_test)))

  text = f'train r2: {train_r2}\ntrain rmse: {train_rmse}\n\ntest r2: {test_r2}\ntest rmse: {test_rmse}\n'

  '''#if print_train:
  #print('train: r2', r2_score(y_train, y_pred_train))
  print('train adjusted r2:', train_r2)
  print("train: root mean squared error:", train_rmse)
  print('Test:')
  #print('test: r2', r2_score(y_test, y_pred_test))
  print('test adjusted r2:', test_r2)
  print("test: root mean squared error:", test_rmse)'''
  print(text)
  return text

In [124]:
def find_best_param(dataset: pd.DataFrame, valid_cols: list, ok_cols: list, to_drop = [], r2_threshold = 0.67, target = "SalePrice"):
  '''loops through all valid columns tries to find columns that give the best r2 result if added to handpicked ones'''

  dataset = dataset.copy()
  best_col = ['', r2_threshold]
  
  for col in valid_cols:
    test_cols = ok_cols + [col]
    df = dataset[test_cols]
  
    try:
      X_train, X_test, y_train, y_test = split_data(df, target)
      y_pred_train, y_pred_test, _ = run_model(X_train, y_train, X_test)

      r2 = adjusted_r2(y_test, y_pred_test, X_train.shape[1])
      if r2 > r2_threshold:
        r2_threshold = r2
        best_col = [col, r2]
        #print_scores(y_train, y_test, y_pred_train, y_pred_test)
    except ValueError:
      continue
      #print(col, 'has NaN')
  
  result = ok_cols + [best_col[0]]
  #print('Best result:', result, best_col[1])
  return best_col


In [125]:
def get_best_cols(df, ok_cols: list, to_drop =[],  r2_threshold=0.67):
  '''shuffles the columns once and finds the set of columns that has the best r2'''
  valid_cols = get_valid_cols(df_cats, ok_cols, to_drop)#
  valid_nodups = valid_cols.copy()
  shuffle(valid_cols)
  #best_result = r2_threshold
  for col in valid_cols:
    #print(new_valid)
    best_col, r2 = find_best_param(df_cats, valid_nodups, ok_cols, r2_threshold)
    if r2 > r2_threshold:# > 0.001:
      r2_threshold = r2
      ok_cols.append(best_col)
      valid_nodups.remove(col)
  return ok_cols, r2_threshold


In [126]:
def many_shuffles(df, my_cols = ['SalePrice', 'OverallQual'], to_drop = [], num=10, threshold = 0.67):
  '''runs shuffle several times to find the best order of columns sets'''
  bestestbest = [my_cols, threshold]
  for x in range(num):
    print('shuffle:', x+1)
    result = get_best_cols(df, my_cols.copy(), to_drop)
    if result[1] > bestestbest[1]:
      bestestbest = result
      print(bestestbest)
  print('\n\nThe result is:')
  print(bestestbest)
  return bestestbest

In [127]:
def create_new_cols(df):
  #create Total SF
  df["TotalSF"] = df["1stFlrSF"] + df["2ndFlrSF"] + df["TotalBsmtSF"] 
  #create Porch
  df['PorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
  #Create green area
  df["OutsideArea"] = df["LotArea"] - df["GrLivArea"] - df["GarageArea"] 
  #Create month sold * year
  df['MonthSold'] = df['YrSold']*12 + df['MoSold'] #-df['YrSold'].min()
  #dates_frames  = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold','MoSold']

  cols = ['KitchenAbvGr', 'BedroomAbvGr']

  #Create booleans
  bsmt = df['TotalBsmtSF']+df['BsmtFinSF1'] + df['BsmtFinSF2'] + df['BsmtUnfSF']
  df['HasBsmt'] = bsmt.apply(lambda x: 1 if x > 0 else 0)  
  df['Modern'] = df[['YearRemodAdd', 'YearBuilt']].max(axis=1).apply(lambda x: 1 if x > 2000 else 0)
  df['Has2Floors'] = df['2ndFlrSF'].apply(lambda x: 1 if x>0 else 0)
  df['HasGarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
  df['HasPool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
  df['HasVnr'] = df['MasVnrArea'].apply(lambda x: 1 if x > 0 else 0)
  #create bath
  fullbsmtb = df['BsmtFullBath'].apply(lambda x: x if x > 0 else 0)
  halfbsmtb = df['BsmtHalfBath'].apply(lambda x: x*0.5 if x > 0 else 0)
  fullb = df['FullBath'].apply(lambda x: x if x > 0 else 0)
  halfb = df['HalfBath'].apply(lambda x: x*0.5 if x > 0 else 0)
  df['Bath'] = fullbsmtb + halfbsmtb + fullb + halfb

  return df
  
# What to do with rooms?
#df[['TotRmsAbvGrd', 'KitchenAbvGr','BedroomAbvGr','FullBath', 'HalfBath','BsmtFullBath','BsmtHalfBath']].head(10)

In [128]:
def drop_cols(df):
  #drop baths
  df = df.drop(columns = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'])
  #drop bsmt qual
  #df = df.drop(columns = ['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'])
  df = df.drop(columns=['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'])
  #drop porch
  df = df.drop(columns = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'])
  #drop garage
  df = df.drop(columns=['GarageCars'])                                              
  #drop misc
  df = df.drop(columns=['LowQualFinSF', 'PoolArea', 'MiscVal', 'TotRmsAbvGrd', '1stFlrSF'])   
  #drop separate year and month
  #df_num = df_num.drop(columns = ['YrSold', 'MoSold'])

  return df  

In [129]:
def fill_rows(df):
  '''fills with values rows that have NaNs'''
  df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].mean()) #REPLACE
  return df
  

In [130]:
def log_cols(df, log_cols):
  '''adds 1 to all numeric columns and np.logs handpicked columns'''
  for column in [col for col in df.columns if not is_cat(df, col)]:
    df[column] = df[column]+1

  for col in log_cols:
    df[col] = np.log(df[col])
  return df

In [131]:
def drop_n_log(df, testdf=False):
  '''runs drop, create new, log, fill functions'''
  df_num = df.copy()

  #Create new columns
  df_num = create_new_cols(df_num)
  #drop columns 
  df_num = drop_cols(df_num)
  df_num = fill_rows(df_num)



  #logs selected cols 
  target = [] if testdf else ['SalePrice']
  to_log = target + ['LotFrontage', 'LotArea', 'GrLivArea']#, 'OutsideArea']#, 'TotalSF']#, 'GarageArea','WoodDeckSF', 'MasVnrArea','TotalBsmtSF','2ndFlrSF', 'PorchSF']#]#,]#, ]#, ]#]]
  df_num = log_cols(df_num, to_log)

  print(df_num.shape)
  return df_num

In [132]:
#Cut iqr
def cut_iqr(df, col, mult):
  '''cuts iqr*3 ouliers'''
  d=df[col].describe()
  val =(d['50%'] + (d['75%']-d['25%'])) * mult
  return df[df[col] <= val]

def plotme(df, col):
    if col != 'SalePrice':
      sns.scatterplot(y = df['SalePrice'], x = df[col])

def make_iqr(df_num):
  '''cuts iqr for selected columns and plots the graph'''
  df_iqr = df_num.copy()

  #for col in [x for x in df_num.columns if not is_cat(df_num, x)]:
  for col in ['SalePrice', 'LotFrontage', 'LotArea', 'MasVnrArea', \
              'TotalBsmtSF', 'GrLivArea', \
              'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageArea', 'PorchSF']:
    df_iqr = cut_iqr(df_iqr, col, 3)
    #plotme(df_iqr, col)

  print(df_iqr.shape)
  return df_iqr

In [133]:
#we need to remove empty values in categories if there are ones 
def cut_cats(df):
  '''removes unused cats'''
  for catcol in df.columns:
    if is_cat(df, catcol):
      df[catcol].cat.remove_unused_categories() 
  return df

def create_mt_catcol(df_new):
  '''if cat col has NaNs creates a MISSING category and assigns taht values to NaN cell'''
  df = df_new.copy()
  for col in df.columns:
      if is_cat(df, col):
        if df[col].isna().any():
          df[col] = df[col].cat.add_categories(['MISSING'])
          df[col] = df[col].fillna('MISSING')
  df=cut_cats(df)
  return df

def factorize_cats(df):
  '''factorizes a cat column'''
  df = df.copy()
  new_df=df.select_dtypes(include=['category']).apply(lambda x: x.factorize()[0])  
  print(type(new_df))
  for col in new_df.columns:
    #print(col)
    df[col] = new_df[col]
  print(df.shape)
  return df

def make_cats(df_iqr):
  '''drops cat columns and runs other cat processing funcitons'''
  df_cats = df_iqr.drop(columns = [col for col in df_iqr.columns if is_cat(df_iqr,col)])
  '''d = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'Utilities', 'Condition1', 
                        'Condition2', 'Street', 'LandContour', 'LandSlope', 
                  'RoofMatl', 'ExterCond', 'BsmtCond', 'BsmtFinType2', 'Heating', 'CentralAir', 
                  'Electrical', 'Functional', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType']#, axis = 1)
  df_cats = df_iqr.drop(columns = [col for col in d])  #drop cols'''
  df_cats = create_mt_catcol(df_cats)                  #fill NaNs with MISSING

  #df_cats =  factorize_cats(df_cats)                   #factorize
  print(df_cats.shape)
  return df_cats

In [134]:
df = improve_cats(raw_df_train)
df_test = improve_cats(raw_df_test)

In [135]:
df_num = drop_n_log(df)

(1460, 74)


In [136]:
df_iqr= make_iqr(df_num)

(1140, 74)


In [137]:
df_cats = make_cats(df_iqr)

(1140, 31)


In [138]:
#model.predict()

In [139]:
#Find second best parameter
my_cols = ['SalePrice']#, 'OverallQual']
valids = get_valid_cols(df_cats, my_cols, ['OverallQual'])
find_best_param(df_cats, valids, my_cols, ['OverallQual'], r2_threshold = 0.5)

['TotalSF', 0.629695738232914]

In [140]:
#test on best combination so far
split_run_test(df_cats[['SalePrice', 'OverallQual', 'TotalSF', 'GarageArea', 'LotArea', 'Bath', 'Fireplaces', 'YearRemodAdd', 'HasGarage', 'YearBuilt', 'OverallCond', 'GrLivArea', 'BedroomAbvGr', 'YrSold', 'Modern', 'Has2Floors', '2ndFlrSF', 'WoodDeckSF', 'GarageYrBlt', 'HasVnr', 'LotArea']])

train r2: 0.913873284321411
train rmse: 0.009750143451649896

test r2: 0.8749025859807202
test rmse: 0.010963007613555209



(Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('stackingregressor',
                  StackingRegressor(estimators=[('LinReg', LinearRegression()),
                                                ('Lasso',
                                                 LassoLarsCV(eps=0.01,
                                                             max_iter=11)),
                                                ('Tree',
                                                 DecisionTreeRegressor(max_depth=5)),
                                                ('GD', SGDRegressor())],
                                    final_estimator=RandomForestRegressor(max_depth=7)))]),
 'train r2: 0.913873284321411\ntrain rmse: 0.009750143451649896\n\ntest r2: 0.8749025859807202\ntest rmse: 0.010963007613555209\n')

In [141]:
#get current best combination 
mymod = get_best_cols(df_cats, ['SalePrice', 'OverallQual'], r2_threshold = 0.67)
print(mymod)
split_run_test(df_cats[mymod[0]])



(['SalePrice', 'OverallQual', 'TotalSF', 'Bath', 'YearRemodAdd', 'MSSubClass', 'MoSold', 'MSSubClass', 'Fireplaces', 'YrSold', 'LotArea', 'HasGarage', 'GarageYrBlt', 'YearRemodAdd', 'Modern'], 0.8715332421455451)
train r2: 0.8856417824389361
train rmse: 0.011331824366064136

test r2: 0.8671680901658777
test rmse: 0.011489959922312611



(Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('stackingregressor',
                  StackingRegressor(estimators=[('LinReg', LinearRegression()),
                                                ('Lasso',
                                                 LassoLarsCV(eps=0.01,
                                                             max_iter=11)),
                                                ('Tree',
                                                 DecisionTreeRegressor(max_depth=5)),
                                                ('GD', SGDRegressor())],
                                    final_estimator=RandomForestRegressor(max_depth=7)))]),
 'train r2: 0.8856417824389361\ntrain rmse: 0.011331824366064136\n\ntest r2: 0.8671680901658777\ntest rmse: 0.011489959922312611\n')

In [142]:
#get best orders of best combinations
result = many_shuffles(df_cats, num=5, threshold = 0.67)

shuffle: 1




(['SalePrice', 'OverallQual', 'TotalSF', 'YearRemodAdd', 'Bath', 'LotArea', 'HasGarage', 'HasGarage', 'LotArea'], 0.8675013858250176)
shuffle: 2




shuffle: 3




(['SalePrice', 'OverallQual', 'TotalSF', 'Bath', 'YearRemodAdd', 'MSSubClass', 'HasGarage', 'Fireplaces', 'Bath', 'OutsideArea', 'Modern'], 0.8716876221319576)
shuffle: 4




shuffle: 5


The result is:
(['SalePrice', 'OverallQual', 'TotalSF', 'Bath', 'YearRemodAdd', 'MSSubClass', 'HasGarage', 'Fireplaces', 'Bath', 'OutsideArea', 'Modern'], 0.8716876221319576)


In [143]:
my_model, perf_stats = split_run_test(df_cats[result[0]])



train r2: 0.8792513398523967
train rmse: 0.011660733551115772

test r2: 0.8686407727182601
test rmse: 0.011513485651806174



## Observe best result

to-do - dataframe with best result

## Best on Linear regeression 0.9195

train: r2 0.8917100533649739

train: root mean squared error: 0.01101263566776913

Test:
test: r2 0.9195194361430203

test: root mean squared error: 0.009303583596568663


['SalePrice', 'OverallQual', 'TotalSF', 'GarageArea', 'LotArea', 'Bath', 'Fireplaces', 'YearRemodAdd', 'HasGarage', 'YearBuilt', 'OverallCond', 'GrLivArea', 'BedroomAbvGr', 'FullBath', 'YrSold', 'New', 'Has2Floors', '2ndFlrSF', 'WoodDeckSF', 'GarageYrBlt', 'HasVnr', 'LotArea'] 
0.9195194361430203)

new_params = baths, new, hasgarage, haspool, has2floors, hasvnr, hasbsmt

log of 'LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice'


drop porches 

drop basements 

drop garage 

drop baths

Drop all cats.

'LowQualFinSF', 'PoolArea', 'MiscVal', 'TotRmsAbvGrd', '1stFlrSF'])  drop misc

iqr
['SalePrice', 'LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageArea', 'PorchSF']


 no factorize

## Another one
['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'GarageArea', 'BsmtFullBath', 'Fireplaces', 'BedroomAbvGr', 'YearRemodAdd', 'HalfBath']

log of ['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice','GarageArea']


(['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'OverallCond', 'YearBuilt', 'LotArea', 'GarageArea', 'Fireplaces', 'HalfBath', 'BsmtFullBath', 'BedroomAbvGr', 'GarageYrBlt', 'WoodDeckSF', 'WoodDeckSF', 'BsmtHalfBath'], 0.9218701457673271)


## LassoLarsCV 0.92

(['SalePrice',
  'OverallQual',
  'TotalSF',
  'GarageArea',
  'LotArea',
  'Foundation',
  'Fireplaces',
  'OverallCond',
  'HalfBath',
  'FullBath',
  'CentralAir',
  'BsmtExposure',
  'BsmtFullBath',
  'PavedDrive',
  'MasVnrType',
  'Functional',
  'BedroomAbvGr',
  'GrLivArea',
  '2ndFlrSF',
  'Condition1',
  'ExterCond'],
 0.9236066685585789) - basic log['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice']. LassoLarsCV

## Stacking .92 = .92

  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), ('Tree',DecisionTreeRegressor(max_depth=5)),
                ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.05))]
final = RandomForestRegressor(max_depth=6)

train: r2 0.9259061491493182
train: root mean squared error: 0.009157887674229528

Test:
test: r2 0.9217500638734336
test: root mean squared error: 0.009142663976721768

log = ['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice']

cols = ['SalePrice', 'OverallQual', 'TotalSF', 'MSSubClass', 'GarageArea', 'BsmtHalfBath', 'MSSubClass', 'Fireplaces', 'LotArea', 'OverallCond', 'HalfBath', 'YearRemodAdd', 'OverallCond', 'HalfBath', 'FullBath', 'YearBuilt', 'BsmtFullBath', 'GrLivArea', 'GarageYrBlt', '2ndFlrSF', 'OutsideArea', 'YrSold', 'YrSold', 'PorchSF', 'WoodDeckSF', 'PorchSF', 'MasVnrArea', 'TotalBsmtSF']

factorized?

Best so far - 0.93 stacking of linear regression with factorized cats.

# To-Do

Create a func that pushes conditions, parameters, and results of every model. logging


# Work with test data

## Prepare test data to fit the model

In [144]:

test_data = make_cats(drop_n_log(df_test.copy(), True))[my_model.feature_names_in_]
for col in test_data.columns[test_data.isna().any()].tolist():      #['TotalSF', 'GarageArea','BsmtFullBath']: 
  test_data[col] = test_data[col].fillna(test_data[col].mode()[0]) 

(1459, 73)
(1459, 30)


## Predict values

In [145]:
pred_ys = my_model.predict(test_data)

## Construct dataset for submission

In [146]:
#create_dataset
ids = np.arange(1461, 2920)
my_result = pd.DataFrame({'Id': ids, 'SalePrice': np.e**pred_ys})
my_result.head()

Unnamed: 0,Id,SalePrice
0,1461,113870.639463
1,1462,153593.860985
2,1463,159276.430275
3,1464,163412.186498
4,1465,184170.400079


## Download dataset

In [147]:
my_result.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#Work with FTP

In [148]:
def connect(address='0.0.0.0', name = 'anonymous', pas = ''):
	ftp = ftplib.FTP(address)
	ftp.login(name, pas)
	return ftp

def ret(ftp, filename = 'test.csv'):
	file = io.BytesIO()
	ftp.retrbinary('RETR '+filename, file.write)
	file.seek(0)
	df = pd.read_csv(file)
	return df

def write(df, ftp, filename = 'test.csv'):
  file = io.BytesIO()
  df.to_csv(file, index = False)
  data = file.getvalue()
  ftp.storbinary('STOR '+filename, io.BytesIO(data))
  ftp.quit()



###Prepare dataset to upload. df in df

In [149]:
stats = "LinearRegression()" #[('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=11,eps=0.01)), ('Tree',DecisionTreeRegressor(max_depth=5)), SGDRegressor())]"
dataset = df_cats[result[0]].copy()
predictions = my_result.copy()
time = datetime.now().strftime("%d/%m/%Y %H:%M:%S")


stats_df = pd.DataFrame({'date':[time], 'daset':[predictions.to_dict(orient='records')],
                         'input dataset': [dataset.to_dict(orient='records')], 
                         'model':[stats], 'r2_rmse': [perf_stats]})


#dfn = pd.json_normalize(eval(str(stats_df.iloc[-1]['daset'])))
#dfn.set_index('Id', inplace=True)


  'input dataset': [dataset.to_dict(orient='records')],


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,113870.639463
1462,153593.860985
1463,159276.430275
1464,163412.186498
1465,184170.400079
...,...
2915,78784.829450
2916,88312.925878
2917,161866.712092
2918,113492.711669
