In [6]:
import numpy as np
import pandas as pd


class Palanthir(object):
    """Convenience wrapper that runs common scikit-learn preprocessing steps on a pandas DataFrame.

    The frame handed to the constructor is kept untouched in ``input_data``; every
    transformation works on (and, with ``store=True``, writes back to) the deep copy
    held in ``output``. The descriptive attributes (``observations``, ``features``,
    ``features_num``, ``features_cat``) always describe ``output`` and are refreshed
    after each stored transformation, which is also logged in ``transformation_history``.
    """

    def __init__(self, input):
        """Initiates a Palanthir-class on top of a pandas DataFrame.

        Parameters
        ----------
        input : pandas.DataFrame
            The raw dataset. (The parameter name shadows the builtin ``input``; it is
            kept for backward compatibility with keyword callers.)
        """
        self.input_data = input
        self.output = self.input_data.copy(deep=True)
        self.train_subset = []
        self.test_subset = []
        self.transformation_history = []
        self._refresh_structure()

    def _refresh_structure(self):
        """Recompute the attributes that describe the shape and column types of ``output``."""
        columns = list(self.output.columns)
        # Classify by "is numeric" rather than "is not object": string columns are not
        # guaranteed to have object dtype (pandas string/category dtypes), and datetime
        # columns cannot be imputed or scaled like numbers.
        numeric_flags = [pd.api.types.is_numeric_dtype(dtype) for dtype in self.output.dtypes]
        self.observations = len(self.output)
        self.features = columns
        self.features_num = [col for col, is_num in zip(columns, numeric_flags) if is_num]
        self.features_cat = [col for col, is_num in zip(columns, numeric_flags) if not is_num]

    @staticmethod
    def _onehot_feature_names(encoder, columns):
        """Return the dummy-column names of a fitted OneHotEncoder across scikit-learn versions."""
        namer = getattr(encoder, "get_feature_names_out", None)
        if namer is None:
            # Older scikit-learn releases only offer the previous spelling.
            namer = encoder.get_feature_names
        return list(namer(list(columns)))

    def update_attributes(self, step=None):
        """Refresh the descriptive attributes and, if ``step`` is given, log it in the history.

        Parameters
        ----------
        step : str, optional
            Human-readable description of the transformation that was just applied.
            When omitted, the attributes are refreshed without adding a history entry.
        """
        self._refresh_structure()
        if step is not None:
            self.transformation_history.append(step)

    def summarize(self):
        """Prints the info, description and missing-value counts of the working dataset.

        Returns
        -------
        None
        """
        import io

        dataset = self.output
        # DataFrame.info() writes to a buffer and returns None, so capture its text
        # instead of printing the return value.
        info_buffer = io.StringIO()
        dataset.info(buf=info_buffer)
        print(
            "Info: ", info_buffer.getvalue(),
            "Description: ", dataset.describe(),
            "Missing values: ", dataset.isna().sum(),
            sep="\n"
        )

    def random_split(self, test_size, store=True):
        """Uses the SKLearn train_test_split to divide the dataset into random training and test subsets.

        Parameters
        ----------
        test_size : float or int
            Forwarded to ``train_test_split``.
        store : bool, default True
            When true, the subsets are kept on the instance as single-element lists
            (``train_subset`` / ``test_subset``).

        Returns
        -------
        tuple of pandas.DataFrame
            ``(train, test)``.
        """
        from sklearn.model_selection import train_test_split

        train, test = train_test_split(self.output, test_size=test_size, random_state=42)
        if store:
            self.train_subset, self.test_subset = [train], [test]
        return train, test

    def stratified_split(self, cols, store=True, test_size=0.2):
        """Uses the SKLearn StratifiedShuffleSplit to divide the dataset into stratified training and test subsets.

        Parameters
        ----------
        cols : label or list of labels
            Column(s) of the working dataset whose values define the strata.
        store : bool, default True
            When true, the subsets are kept on the instance as single-element lists.
        test_size : float or int, default 0.2
            Forwarded to ``StratifiedShuffleSplit``.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(strat_train_set, strat_test_set)``.
        """
        from sklearn.model_selection import StratifiedShuffleSplit

        dataset = self.output
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
        train_index, test_index = next(splitter.split(dataset, dataset[cols]))
        # The splitter yields positional indices, so select with iloc; loc would pick
        # the wrong rows (or fail) whenever the index is not a plain 0..n-1 range.
        strat_train_set = dataset.iloc[train_index]
        strat_test_set = dataset.iloc[test_index]
        if store:
            self.train_subset, self.test_subset = [strat_train_set], [strat_test_set]
        return strat_train_set, strat_test_set

    def PCA(self, n_components=0.80, store=True):
        """Uses the SKLearn PCA to project the numerical features onto principal components.

        Also prints and plots the cumulative explained-variance ratio of a full
        (un-truncated) PCA fitted on the same numerical features.

        Parameters
        ----------
        n_components : int or float, default 0.80
            Forwarded to ``sklearn.decomposition.PCA``.
        store : bool, default True
            When true, the working dataset is replaced by the component frame, which
            means any non-numerical columns are dropped from ``output``.

        Returns
        -------
        pandas.DataFrame
            One ``PCA_<i>`` column per retained component, indexed like the dataset.
        """
        import matplotlib.pyplot as plt
        from sklearn.decomposition import PCA

        dataset = self.output[self.features_num]
        pca_data = PCA(n_components=n_components).fit_transform(dataset)
        component_names = ["PCA_" + str(position + 1) for position in range(pca_data.shape[1])]
        output_df = pd.DataFrame(pca_data, columns=component_names, index=dataset.index)
        if store:
            self.output = output_df
            self.update_attributes(step="Performed Principal Component Analysis")
        explained_variance = PCA().fit(dataset).explained_variance_ratio_
        cumsum = np.cumsum(explained_variance)
        print(cumsum)
        plt.plot(["PCA" + str(num) for num in range(1, len(cumsum) + 1)], cumsum)
        plt.show()
        return output_df

    def fill_nulls(self, strategy="median", store=True):
        """Uses the SKLearn SimpleImputer to fill out any missing values in the numerical features of the dataset.

        Parameters
        ----------
        strategy : str, default "median"
            Forwarded to ``SimpleImputer``.
        store : bool, default True
            When true, the imputed values are written back to the working dataset.

        Returns
        -------
        pandas.DataFrame
            The imputed numerical features.
        """
        from sklearn.impute import SimpleImputer

        dataset = self.output[self.features_num]
        imputed_data = SimpleImputer(strategy=strategy).fit_transform(dataset)
        output_df = pd.DataFrame(imputed_data, columns=dataset.columns, index=dataset.index)
        if store:
            self.output[self.features_num] = output_df
            self.update_attributes(step="Filled nulls")
        return output_df

    def encode_order(self, store=True):
        """Uses the SKLearn OrdinalEncoder to replace the categorical features with ordinal codes.

        Parameters
        ----------
        store : bool, default True
            When true, the codes overwrite the categorical columns of the working dataset.

        Returns
        -------
        pandas.DataFrame
            The encoded categorical features.
        """
        from sklearn.preprocessing import OrdinalEncoder

        dataset = self.output[self.features_cat]
        encoded_data = OrdinalEncoder().fit_transform(dataset)
        output_df = pd.DataFrame(encoded_data, columns=dataset.columns, index=dataset.index)
        if store:
            self.output[self.features_cat] = output_df
            self.update_attributes(step="Encoded order of categorial features")
        return output_df

    def make_dummies(self, store=True):
        """Uses the SKLearn OneHotEncoder to turn categorical features of the dataset into dummy-variables.

        Parameters
        ----------
        store : bool, default True
            When true, the working dataset is replaced by the numerical features plus
            the dummy columns.

        Returns
        -------
        pandas.DataFrame
            Numerical features joined (on the index) with one dummy column per category.
        """
        from sklearn.preprocessing import OneHotEncoder

        dataset = self.output[self.features_cat]
        encoder = OneHotEncoder().fit(dataset)
        new_column_names = self._onehot_feature_names(encoder, dataset.columns)
        dummy_data = encoder.transform(dataset).toarray()
        dummy_data_df = pd.DataFrame(dummy_data, columns=new_column_names, index=dataset.index)
        output_df = pd.merge(self.output[self.features_num], dummy_data_df, left_index=True, right_index=True)
        if store:
            self.output = output_df
            self.update_attributes(step="Turned categorical features into dummy variables")
        return output_df

    def scale(self, store=True):
        """Uses the SKLearn StandardScaler to scale all numerical features of the dataset.

        Parameters
        ----------
        store : bool, default True
            When true, the scaled values are written back to the working dataset.

        Returns
        -------
        pandas.DataFrame
            The scaled numerical features, with the original column names and index
            (consistent with the other transformation methods).
        """
        from sklearn.preprocessing import StandardScaler

        dataset = self.output[self.features_num]
        scaled_data = StandardScaler().fit_transform(dataset)
        output_df = pd.DataFrame(scaled_data, columns=dataset.columns, index=dataset.index)
        if store:
            self.output[self.features_num] = output_df
            self.update_attributes(step="Scaled feature-values")
        return output_df

    def cluster(self, max_k=10, store=True):
        """Uses the SKLearn KMeans to cluster the numerical features of the dataset.

        Fits one model per k in ``2..max_k``, plots the silhouette score per k and
        picks the k with the highest score.

        Parameters
        ----------
        max_k : int, default 10
            Largest number of clusters to try; must be at least 2.
        store : bool, default True
            When true, a ``"Cluster"`` label column is added to the working dataset.

        Returns
        -------
        pandas.DataFrame
            The working dataset (with the label column when ``store`` is true).

        Raises
        ------
        ValueError
            If ``max_k`` is smaller than 2.
        """
        if max_k < 2:
            raise ValueError("max_k must be at least 2: the silhouette score needs two or more clusters")

        import matplotlib.pyplot as plt
        from sklearn.cluster import KMeans
        from sklearn.metrics import silhouette_score

        # Only numerical columns can be clustered; this also keeps a previously stored
        # "Cluster" label column out of the input when the method is called again.
        dataset = self.output[self.features_num]
        candidate_ks = list(range(2, max_k + 1))
        kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(dataset) for k in candidate_ks]
        silhouettes = [silhouette_score(dataset, model.labels_) for model in kmeans_per_k]
        best_position = silhouettes.index(max(silhouettes))
        best_k = candidate_ks[best_position]
        plt.plot(candidate_ks, silhouettes)
        plt.xlabel("KMeans")
        plt.ylabel("Silhouette-score")
        plt.show()
        print("Best silhouette is obtained with k as: ", best_k)
        if store:
            # The winning model was fitted on the same data with the same seed, so its
            # labels are reused instead of fitting once more.
            best_labels = kmeans_per_k[best_position].labels_
            self.output["Cluster"] = ["Cluster " + str(label) for label in best_labels]
            self.update_attributes(step="Added Cluster-label as column to dataset")
        return self.output

    def construct_pipeline(self):
        """Uses the SKLearn Pipeline and ColumnTransformer to impute, scale and dummy-encode the dataset in one pass.

        Numerical features go through median imputation and standard scaling;
        categorical features are one-hot encoded. The working dataset is replaced by
        the transformed frame.

        Returns
        -------
        pandas.DataFrame
            The transformed dataset: numerical columns first, dummy columns after.

        Raises
        ------
        ValueError
            If the working dataset has no columns to transform.
        """
        from sklearn.compose import ColumnTransformer
        from sklearn.impute import SimpleImputer
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import OneHotEncoder, StandardScaler

        dataset = self.output
        num_columns = list(self.features_num)
        cat_columns = list(self.features_cat)

        # Pipelines need estimator objects as steps (not already-transformed frames),
        # and each branch is only registered when it has columns to work on.
        transformers = []
        if num_columns:
            num_pipeline = Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ])
            transformers.append(("num", num_pipeline, num_columns))
        if cat_columns:
            cat_pipeline = Pipeline([
                ("dummy", OneHotEncoder()),
            ])
            transformers.append(("cat", cat_pipeline, cat_columns))
        if not transformers:
            raise ValueError("The dataset has no features to transform")

        full_pipeline = ColumnTransformer(transformers)
        transformed = full_pipeline.fit_transform(dataset)
        if hasattr(transformed, "toarray"):
            # ColumnTransformer may hand back a sparse matrix when dummies dominate.
            transformed = transformed.toarray()

        column_names = list(num_columns)
        if cat_columns:
            fitted_encoder = full_pipeline.named_transformers_["cat"].named_steps["dummy"]
            column_names += self._onehot_feature_names(fitted_encoder, cat_columns)

        self.output = pd.DataFrame(transformed, columns=column_names, index=dataset.index)
        self.update_attributes(step="Applied imputing, scaling and dummy-encoding pipeline")
        return self.output

    def cross_validate(self, model, x, y, score_measure="neg_mean_squared_error", folds=10):
        """Uses the SKLearn cross_val_score to cross-validate a model.

        Parameters
        ----------
        model : estimator
            Any scikit-learn compatible estimator.
        x, y : array-like
            Features and target to validate on.
        score_measure : str, default "neg_mean_squared_error"
            Forwarded as ``scoring``.
        folds : int, default 10
            Forwarded as ``cv``.

        Returns
        -------
        numpy.ndarray
            One score per fold.
        """
        from sklearn.model_selection import cross_val_score

        return cross_val_score(model, x, y, scoring=score_measure, cv=folds)

    def full_analysis(self, model, target=None, test_size=0.2):
        """Splits the dataset, cross-validates ``model`` on the training part and prints the RMSE scores.

        Parameters
        ----------
        model : estimator
            Any scikit-learn compatible regressor.
        target : label
            Name of the column in the working dataset to predict. Required: the class
            has no other way of knowing which column is the target.
        test_size : float or int, default 0.2
            Share (or number) of observations held out by ``random_split``.

        Returns
        -------
        numpy.ndarray
            The per-fold RMSE scores that were printed.

        Raises
        ------
        ValueError
            If ``target`` is missing or is not a column of the working dataset.
        """
        if target is None:
            raise ValueError("full_analysis needs the name of the target column (target=...)")
        if target not in self.output.columns:
            raise ValueError("target column not found in the dataset: " + repr(target))

        train, _ = self.random_split(test_size)
        X_train = train.drop(columns=[target])
        Y_train = train[target]

        sqrt_scores = np.sqrt(
            -self.cross_validate(model, X_train, Y_train, score_measure="neg_mean_squared_error", folds=10))
        print(
            "RMSE-scores: ", sqrt_scores,
            "RMSE-mean: ", sqrt_scores.mean(),
            "RMSE-std: ", sqrt_scores.std()
        )
        return sqrt_scores

In [7]:
# Location of the housing CSV. The variable is no longer called `dir`, which would
# shadow the builtin dir() for the rest of the kernel session.
# NOTE(review): this is an absolute, machine-specific path; point it at a project-relative
# data directory so the notebook runs on other machines.
housing_csv_path = 'C:/Users/JesperFrederiksen/PycharmProjects/ML-code/datasets/housing/housing.csv'
df = pd.read_csv(housing_csv_path)

In [8]:
# Wrap the loaded frame; Palanthir keeps `df` as input_data and works on a deep copy (pal.output).
pal = Palanthir(df)

In [9]:
# All column names of the working copy.
pal.features

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [10]:
# Log of stored transformations; empty here because none has been applied yet.
pal.transformation_history

[]

In [11]:
# Columns the class treats as categorical.
pal.features_cat

['ocean_proximity']

In [12]:
# Standardise the numerical columns in place (store=True is the default), then list
# which columns the class treats as numerical. The trailing "." that followed
# features_num was a syntax error and has been removed.
pal.scale()
pal.features_num

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [15]:
# Peek at the working copy after scale(): the numerical columns are standardised,
# ocean_proximity is unchanged.
pal.output.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-1.327835,1.052548,0.982143,-0.804819,-0.970325,-0.974429,-0.977033,2.344766,2.129631,NEAR BAY
1,-1.322844,1.043185,-0.607019,2.04589,1.348276,0.861439,1.669961,2.332238,1.314156,NEAR BAY
2,-1.332827,1.038503,1.856182,-0.535746,-0.825561,-0.820777,-0.843637,1.782699,1.258693,NEAR BAY
3,-1.337818,1.038503,1.856182,-0.624215,-0.718768,-0.766028,-0.733781,0.932968,1.1651,NEAR BAY
4,-1.337818,1.038503,1.856182,-0.462404,-0.611974,-0.759847,-0.629157,-0.012881,1.1729,NEAR BAY


In [16]:
# NOTE(review): the recorded output is a NameError for `en_data_df`, a name that does not
# occur in the Palanthir class defined in this notebook; the output is presumably stale —
# restart the kernel and re-run to confirm.
pal.encode_order()

NameError: name 'en_data_df' is not defined

In [77]:
# NOTE(review): `df2` is not defined in any cell shown here and the execution count (77)
# is out of sequence; this cell presumably relies on leftover kernel state — verify it
# survives Restart & Run All, or remove it.
df2

0        NEAR BAY
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
           ...   
20635      INLAND
20636      INLAND
20637      INLAND
20638      INLAND
20639      INLAND
Name: ocean_proximity, Length: 20640, dtype: object