# LINEAR REGRESSION

## Input and Read

In [1]:
import os
import dask.dataframe as dd


# Takes either the path of a directory (containing a single .csv file, or train.csv and test.csv)
# or the direct path of a single .csv file, and builds a list of dask dataframes, available as .df_list:
# [singular_df_x, singular_df_y] or [train_df_x, train_df_y, test_df_x, test_df_y]
class EmptyListError(Exception):
    """Raised when no dataset could be loaded from the given path."""


class DataframeExtractor_csv:
    """Load CSV data as dask dataframes split into feature (x) and label (y) frames.

    ``directory_path`` may be:

    * a path ending in ``.csv``: that file is read directly;
    * a directory containing exactly one ``.csv`` file: that file is read;
    * any other directory: ``train.csv`` and ``test.csv`` inside it are read.

    The resulting ``df_list`` is ``[x, y]`` for a single file, or
    ``[train_x, train_y, test_x, test_y]`` for the train/test layout.
    Every ``y`` frame holds the columns named in ``label_names``; every ``x``
    frame holds all remaining columns.

    Parameters
    ----------
    directory_path : str or os.PathLike
        CSV file path or directory path, as described above.
    label_names : list of str, str or None, optional
        Column name(s) to treat as labels. ``None`` (the default) means no
        label columns; a single string is treated as a one-element list.

    Raises
    ------
    FileNotFoundError
        If a CSV file expected inside the directory is missing.
    EmptyListError
        If no dataframe was loaded.
    """

    def __init__(self, directory_path, label_names=None):
        self._directory_path = directory_path
        # Normalise to a private list: avoids a shared mutable default and
        # aliasing the caller's list.
        if label_names is None:
            label_names = []
        elif isinstance(label_names, str):
            label_names = [label_names]
        self._label_names = list(label_names)
        self._df_list = []
        self.get_df_list()

    @property
    def directory_path(self):
        """Path that was passed to the constructor."""
        return self._directory_path

    @property
    def labels(self):
        """List of label column names."""
        return self._label_names

    @property
    def df_list(self):
        """List of dask dataframes built by ``get_df_list``."""
        return self._df_list

    def get_df_list(self):
        """(Re)build the list of dask dataframes, store it and return it.

        The list is rebuilt from scratch on every call, so calling this
        method again does not append duplicate frames.
        """
        # Accept pathlib.Path as well as str; str paths are returned unchanged.
        path = os.fspath(self._directory_path)
        frames = []

        # A CSV file path has been entered. No existence check here:
        # dd.read_csv reports unreadable paths itself.
        if path.endswith(".csv"):
            print(f"Reading single csv from {self._directory_path}")
            frames.extend(self._read_xy(path))

        # Directory holding exactly one CSV file.
        elif self._no_of_csv(path) == 1:
            csv_path = self._get_csv_path(path)
            self._check_dir_exists(csv_path)
            frames.extend(self._read_xy(csv_path))

        # Directory expected to hold train.csv and test.csv.
        else:
            print(f"Reading train.csv and test.csv of directory {self._directory_path}")
            for filename in ("train.csv", "test.csv"):
                csv_path = os.path.join(path, filename)
                self._check_dir_exists(csv_path)
                frames.extend(self._read_xy(csv_path))

        # If no datasets are found
        if not frames:
            raise EmptyListError("No Datasets found")

        self._df_list = frames
        return frames

    def _read_xy(self, csv_path):
        """Read one CSV with dask and return ``[x_frame, y_frame]``."""
        csv_df = dd.read_csv(csv_path, assume_missing=True, sample_rows=1000)
        is_label = csv_df.columns.isin(self._label_names)
        # NOTE(review): selecting a label column that is absent from the file
        # is left to dask to reject; a test.csv without labels will fail here.
        return [csv_df.loc[:, ~is_label], csv_df[self._label_names]]

    @staticmethod
    def _check_dir_exists(directory):
        """Raise ``FileNotFoundError`` unless ``directory`` is an existing file."""
        if not os.path.isfile(directory):
            raise FileNotFoundError(f"File {directory} does not exist")

    @staticmethod
    def _list_csv_files(directory):
        """Return the sorted names of entries in ``directory`` ending in ``.csv``."""
        return sorted(name for name in os.listdir(directory) if name.endswith(".csv"))

    @staticmethod
    def _no_of_csv(directory):
        """Return how many entries in ``directory`` end in ``.csv``."""
        return len(DataframeExtractor_csv._list_csv_files(directory))

    @staticmethod
    def _get_csv_path(directory):
        """Return the path of the alphabetically first ``.csv`` entry, or ``None``."""
        csv_files = DataframeExtractor_csv._list_csv_files(directory)
        if not csv_files:
            return None
        return os.path.join(directory, csv_files[0])
        

#### Example Use Case

##### For Directory path

In [2]:
#For House Prediction Dataset https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv
# NOTE: absolute, machine-specific path -- point this at your local copy of the dataset.
directory1 = "/home/anish/ASC-ML-EXP-DATASETS/LinReg-tabular/house-prices-advanced-regression-techniques/"


# A directory path is given, so train.csv and test.csv are read and df_list is
# [train_df_x, train_df_y, test_df_x, test_df_y]; the label_names columns go into the *_y frames.
dataset_list = DataframeExtractor_csv(directory1, label_names = ["MSSubClass", "LotFrontage"]).df_list

# Index 0 is the train feature (x) frame; head() is the cell's displayed output.
print("Train Dataset X Columns")
dataset_list[0].head()

Reading train.csv and test.csv of directory /home/anish/ASC-ML-EXP-DATASETS/LinReg-tabular/house-prices-advanced-regression-techniques/
Train Dataset X Columns


Unnamed: 0,Id,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1.0,RL,8450.0,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,2.0,RL,9600.0,Pave,,Reg,Lvl,AllPub,FR2,Gtl,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,3.0,RL,11250.0,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,4.0,RL,9550.0,Pave,,IR1,Lvl,AllPub,Corner,Gtl,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,5.0,RL,14260.0,Pave,,IR1,Lvl,AllPub,FR2,Gtl,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0


In [3]:
# Index 1 is the train label (y) frame: only the columns passed as label_names.
print("Train Dataset Y Columns")
dataset_list[1].head()

Train Dataset Y Columns


Unnamed: 0,MSSubClass,LotFrontage
0,60.0,65.0
1,20.0,80.0
2,60.0,68.0
3,70.0,60.0
4,60.0,84.0


##### For direct csv path

In [4]:
#For Weather Prediction dataset https://www.kaggle.com/budincsevity/szeged-weather
# NOTE: absolute, machine-specific path -- point this at your local copy of the dataset.
directory2 = "/home/anish/ASC-ML-EXP-DATASETS/LinReg-tabular/weather-history-dataset/weatherHistory.csv"

# A direct .csv path yields a two-element list [singular_df_x, singular_df_y];
# singular_df_y holds only the "Summary" label column.
dataset_list2 = DataframeExtractor_csv(directory2, label_names = ["Summary"]).df_list

# Index 0 is the feature (x) frame; head() is the cell's displayed output.
print("Dataset X values")
dataset_list2[0].head()

Reading single csv from /home/anish/ASC-ML-EXP-DATASETS/LinReg-tabular/weather-history-dataset/weatherHistory.csv
Dataset X values


Unnamed: 0,Formatted Date,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [5]:
# Index 1 is the label (y) frame: only the "Summary" column.
print("Dataset Y values")
dataset_list2[1].head()

Dataset Y values


Unnamed: 0,Summary
0,Partly Cloudy
1,Partly Cloudy
2,Mostly Cloudy
3,Partly Cloudy
4,Mostly Cloudy


## Data Cleaning

## Visualization

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

## Feature Engineering

## Generating Models

## Evaluating Models

## Select and Fit Model