# LINEAR REGRESSION

## Input and Read

In [5]:
import os
import dask.dataframe as dd


class DataframeExtractor_csv:
    """Load CSV data from a path and return it as a list of dask dataframes.

    The path given at construction is interpreted in one of two ways:

    * it ends with ``.csv``: it is read as a single dataset and the result
      is ``[singular_df]``;
    * otherwise it is treated as a directory that must contain ``train.csv``
      and ``test.csv``, and the result is ``[train_df, test_df]``.

    Parameters
    ----------
    directory_path : str or os.PathLike
        Path of a single ``.csv`` file, or of a directory holding
        ``train.csv`` and ``test.csv``.
    """

    def __init__(self, directory_path):
        self._directory_path = directory_path

    @property
    def directory_path(self):
        """The path exactly as it was passed to the constructor."""
        return self._directory_path

    def return_dask_df(self):
        """Read the configured path and return the dataframes as a list.

        Returns
        -------
        list
            ``[singular_df]`` for a ``.csv`` path, ``[train_df, test_df]``
            for a directory path.

        Raises
        ------
        FileNotFoundError
            If the directory form is used and ``train.csv`` or ``test.csv``
            is missing.
        """
        # os.fspath lets pathlib.Path objects through; a str is returned as-is.
        path = os.fspath(self._directory_path)

        # A csv file path has been entered.
        if path.endswith(".csv"):
            print("Reading single csv file path")
            return [self._read_csv(path)]

        # Otherwise locate train.csv and test.csv inside the directory.
        print("Reading train.csv and test.csv of directory")
        train_path = os.path.join(path, "train.csv")
        self._check_dir_exists(train_path)

        test_path = os.path.join(path, "test.csv")
        self._check_dir_exists(test_path)

        # Both files are verified before anything is read. Every path through
        # this method either returns a non-empty list or raises, so no
        # separate "no datasets found" guard is needed.
        return [self._read_csv(train_path), self._read_csv(test_path)]

    @staticmethod
    def _read_csv(csv_path):
        """Read one csv with the read options used throughout this class."""
        # NOTE(review): per the dask docs, assume_missing treats unspecified
        # integer columns as floats and sample_rows sets how many rows are
        # used for dtype inference -- confirm against the installed version.
        return dd.read_csv(csv_path, assume_missing=True, sample_rows=1000)

    @staticmethod
    def _check_dir_exists(directory):
        """Raise FileNotFoundError unless ``directory`` is an existing file.

        Despite its name, this helper validates a *file* path (it is called
        with the train.csv / test.csv paths), so the message says "File".
        """
        if not os.path.isfile(directory):
            raise FileNotFoundError(f"File {directory} does not exist")
        

#### Example Use Case

##### For Directory path

In [6]:
# House Prices dataset: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv
# NOTE(review): absolute, machine-specific path; the directory must contain train.csv and test.csv.
directory1 = "/home/anish/ASC-ML-EXP-DATASETS/LinReg-tabular/house-prices-advanced-regression-techniques/"

data_reader1 = DataframeExtractor_csv(directory1)
# Directory input: index 0 is the train.csv dataframe, index 1 is the test.csv dataframe.
dataset_list = data_reader1.return_dask_df()

print("Train Dataset")
dataset_list[0].head()

Reading train.csv and test.csv of directory
Train Dataset


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0


In [7]:
# Preview the second entry of dataset_list, i.e. the dataframe read from test.csv.
print("Test Dataset")
dataset_list[1].head()

Test Dataset


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461.0,20.0,RH,80.0,11622.0,Pave,,Reg,Lvl,AllPub,...,120.0,0.0,,MnPrv,,0.0,6.0,2010.0,WD,Normal
1,1462.0,20.0,RL,81.0,14267.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,,Gar2,12500.0,6.0,2010.0,WD,Normal
2,1463.0,60.0,RL,74.0,13830.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,MnPrv,,0.0,3.0,2010.0,WD,Normal
3,1464.0,60.0,RL,78.0,9978.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,,,0.0,6.0,2010.0,WD,Normal
4,1465.0,120.0,RL,43.0,5005.0,Pave,,IR1,HLS,AllPub,...,144.0,0.0,,,,0.0,1.0,2010.0,WD,Normal


##### For direct csv path

In [8]:
# Weather Prediction dataset: https://www.kaggle.com/budincsevity/szeged-weather
# NOTE(review): absolute, machine-specific path. Despite the variable name, this points
# at a single .csv file, so the extractor takes its single-file branch.
directory = "/home/anish/ASC-ML-EXP-DATASETS/LinReg-tabular/weather-history-dataset/weatherHistory.csv"
data_reader2 = DataframeExtractor_csv(directory)
# Single-file input: the returned list holds exactly one dataframe.
dataset_list2 = data_reader2.return_dask_df()

print("Dataset")
dataset_list2[0].head()

Reading single csv file path
Dataset


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


## Data Cleaning

## Visualization

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

## Feature Engineering

## Generating Models

## Evaluating Models

## Select and Fit Model