# **Data Science and OOP Project**

**Data Preprocessing Module of Data Science**



*   Importing necessary libraries





In [25]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler



*   `DataPreprocessing` class that bundles common data preprocessing techniques



In [26]:
class DataPreprocessing:
    """
    Loads a tabular file into a pandas DataFrame and applies common preprocessing steps.

    The working DataFrame is stored in ``self.df``. The mutating methods
    (drop_duplicates, fill_missing_values, scale_data, encode_categorical,
    select_features) update ``self.df``; the ``view_*``, ``isna`` and ``isnull``
    methods only read it.
    """

    def __init__(self, file_path, file_format):
        """
        Initializes the DataPreprocessing object with a file path and format.

        Parameters:
        file_path (str): The path (or URL / file-like object accepted by pandas) of the file to be loaded.
        file_format (str): The format of the file to be loaded. Supported formats are 'csv', 'excel', and 'json'.
            Matching ignores case and surrounding whitespace.

        Raises:
        ValueError: If file_format is not one of the supported formats.
        """
        readers = {
            'csv': pd.read_csv,
            'excel': pd.read_excel,
            'json': pd.read_json,
        }
        # Non-string formats map to None so they reach the ValueError below
        # instead of failing on .lower() or on hashing.
        key = file_format.strip().lower() if isinstance(file_format, str) else None
        reader = readers.get(key)
        if reader is None:
            raise ValueError("Unsupported file format. Please choose 'csv', 'excel', or 'json'.")
        self.df = reader(file_path)

    def _with_columns(self, columns, values):
        """
        Returns a copy of self.df in which each listed column is replaced by the
        matching column of a 2-D array (same row order, index preserved).
        """
        result = self.df.copy()
        for position, name in enumerate(columns):
            result[name] = values[:, position]
        return result

    def drop_duplicates(self):
        """
        Drops duplicate rows from the DataFrame (in place; the index is not reset).
        """
        self.df.drop_duplicates(inplace=True)

    def fill_missing_values(self):
        """
        Fills missing values in the numeric columns with the mean of the column.

        Non-numeric columns are left untouched (a mean is not defined for them),
        as are numeric columns that contain no values at all. The row index and
        column order are preserved; imputed columns become floating point.
        """
        numeric = self.df.select_dtypes(include=[np.number])
        has_values = numeric.notna().any(axis=0)
        # An all-NaN column has no mean; the imputer would drop it from its
        # output and the remaining columns would no longer line up.
        columns = [name for name, present in has_values.items() if present]
        if not columns:
            return
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        filled = imputer.fit_transform(numeric[columns])
        self.df = self._with_columns(columns, filled)

    def scale_data(self):
        """
        Scales the numeric columns of the DataFrame to have zero mean and unit variance.

        Non-numeric columns are left untouched. The row index and column order
        are preserved; scaled columns become floating point. Does nothing when
        there are no rows or no numeric columns.
        """
        numeric = self.df.select_dtypes(include=[np.number])
        if numeric.empty:
            return
        scaler = StandardScaler()
        scaled = scaler.fit_transform(numeric)
        self.df = self._with_columns(numeric.columns, scaled)

    def encode_categorical(self, column):
        """
        One-hot encodes a specified categorical column in the DataFrame.

        The indicator columns are named '<column>_<value>' and appended after
        the existing columns; the original column is removed.

        Parameters:
        column (str): The name of the column to be one-hot encoded.
        """
        one_hot_encoded = pd.get_dummies(self.df[column], prefix=column)
        self.df = pd.concat([self.df, one_hot_encoded], axis=1)
        self.df.drop(column, axis=1, inplace=True)

    def select_features(self, n, target=None):
        """
        Selects the top n features in the DataFrame using the f_classif method.

        Parameters:
        n (int): The number of top features to select.
        target (str or array-like): The class labels to score the features against.
            A string is taken as the name of a column of the DataFrame: that column
            is excluded from the candidate features and kept in the result.
            Anything else is passed to scikit-learn as the label vector, and every
            column of the DataFrame is treated as a candidate feature.

        Raises:
        ValueError: If no target is given.
        KeyError: If target is a string that is not a column of the DataFrame.

        Note: f_classif is expected to need numeric features without missing
        values, so encode / impute first.
        """
        if target is None:
            raise ValueError(
                "select_features requires a target: pass a column name or an array-like of labels."
            )
        from sklearn.feature_selection import SelectKBest, f_classif

        target_is_column = isinstance(target, str)
        if target_is_column:
            labels = self.df[target]
            features = self.df.drop(columns=[target])
        else:
            labels = target
            features = self.df

        selector = SelectKBest(f_classif, k=n)
        selected_values = selector.fit_transform(features, labels)
        selected_names = features.columns[selector.get_support()]
        result = pd.DataFrame(selected_values, columns=selected_names, index=features.index)
        if target_is_column:
            # Keep the labels next to the selected features so they are not lost.
            result[target] = labels
        self.df = result

    def view_head(self, n=5):
        """
        Returns the first n rows of the DataFrame.

        Parameters:
        n (int): The number of rows to return. Default is 5.
        """
        return self.df.head(n)

    def view_tail(self, n=5):
        """
        Returns the last n rows of the DataFrame.

        Parameters:
        n (int): The number of rows to return. Default is 5.
        """
        return self.df.tail(n)

    def view_max(self):
        """
        Returns the maximum value for each column in the DataFrame.
        """
        return self.df.max()

    def view_min(self):
        """
        Returns the minimum value for each column in the DataFrame.
        """
        return self.df.min()

    def view_info(self):
        """
        Prints a summary of the DataFrame (rows and columns, column data types, memory usage).

        Returns:
        None: The summary is written to standard output by DataFrame.info(); nothing is returned.
        """
        return self.df.info()

    def view_describe(self):
        """
        Returns statistical information about the DataFrame, including the count, mean, standard deviation, minimum, and maximum values for each column.
        """
        return self.df.describe()

    def isna(self):
        """
        Indicates which values of the stored DataFrame are missing.

        Returns:
        pandas DataFrame: A boolean DataFrame of the same shape, True where a value is missing.
        """
        return self.df.isna()

    def isnull(self):
        """
        Indicates which values of the stored DataFrame are missing (same check as isna).

        Returns:
        pandas DataFrame: A boolean DataFrame of the same shape, True where a value is missing.
        """
        return self.df.isnull()

    def drop_columns(self, df, columns):
        """
        Drops the specified columns from a DataFrame.

        Parameters:
        df (pandas DataFrame or None): The DataFrame to drop columns from. It is not
            modified and the stored DataFrame is not touched. Pass None to drop the
            columns from the stored DataFrame instead, which is then updated.
        columns (list of str): The names of the columns to be dropped.

        Returns:
        pandas DataFrame: A DataFrame without the specified columns.
        """
        if df is None:
            self.df = self.df.drop(columns=columns)
            return self.df
        return df.drop(columns=columns)



*   Testing on a sample dataset



In [27]:
# Load the sample student-scores CSV straight from its URL and preview the first rows.
data = DataPreprocessing(
    file_path='https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv',
    file_format='csv',
)
print(data.view_head())

   Hours  Scores
0    2.5      21
1    5.1      47
2    3.2      27
3    8.5      75
4    3.5      30


In [28]:
# Print the summary statistics returned by view_describe() for the loaded data.
print(data.view_describe())

           Hours     Scores
count  25.000000  25.000000
mean    5.012000  51.480000
std     2.525094  25.286887
min     1.100000  17.000000
25%     2.700000  30.000000
50%     4.800000  47.000000
75%     7.400000  75.000000
max     9.200000  95.000000


In [29]:
# Print the minimum value of each column.
print(data.view_min())

Hours      1.1
Scores    17.0
dtype: float64


In [30]:
# Element-wise missing-value mask for the loaded data (True where a value is missing).
data.isnull()

Unnamed: 0,Hours,Scores
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False
