<a href="https://colab.research.google.com/github/GDharan10/Data_Preprocessing_Framework/blob/main/Data_Preprocessing_Framework_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Librarys**

In [1]:
import pandas as pd
import numpy as np

#Statistic
from scipy import stats

#Visualization
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns


# **DataFrame**

In [2]:
df = sns.load_dataset('tips')
df.to_csv('tips.csv', index=False)

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# **DataFrame observation**


In [4]:
continuous_columns = ["total_bill", "tip"]
category_columns = ["sex",	"smoker",	"day", "time", "size"]

#Unsupervised

# **Data cleaning using pandas**


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [6]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [7]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

# **Hypothesis testing / Statistical analysis**


Continuous column - Central Limit Theorem, 1-Tailed Test (one-sample t-test)

Continuous vs. Continuous - Correlation, 2-Tailed Test (two-sample t-test)

Continuous vs. Categorical - ANOVA (Analysis of Variance)

Categorical vs. Categorical - Chi-Square Test

In [None]:
'''
             Correlation
for all numerical columns

           Central Limit Theorem
total_bill
tip

             1-Tailed Test
total_bill
tip

              2-Tailed Test
total_bill,	tip 	   - H0 is rejected

                ANOVA
total_bill, sex	    - H0 is rejected
total_bill, smoker   - H0 is accepted
total_bill, day     - H0 is rejected
total_bill, time	    - H0 is rejected
total_bill, size    - H0 is rejected

tip, sex,	          - H0 is accepted
tip, smoker          - H0 is accepted
tip, day            - H0 is accepted
tip, time	          - H0 is accepted
tip, size            - H0 is rejected

            Chi-Square Test
sex, smoker          - H0 is accepted
sex, day              - H0 is rejected
sex, time	          - H0 is rejected
sex, size           - H0 is accepted

day, time	        - H0 is rejected
day, size	         - H0 is rejected

time, size	         - H0 is rejected
'''

# Correlation

In [9]:
df.corr()

  df.corr()


Unnamed: 0,total_bill,tip,size
total_bill,1.0,0.675734,0.598315
tip,0.675734,1.0,0.489299
size,0.598315,0.489299,1.0


In [10]:
def hypothesisTesting(continuous_columns = [], category_columns = [], sampling_rate = 0.05):
  #Central Limit Theorem
  oneContinesColumn_result = {}
  comperativeColumn_result = {}
  if continuous_columns:
    samplesize = int(sampling_rate * len(df[continuous_columns[0]]))
    for column in continuous_columns:
      population=df[column].values
      population_mean = population.mean()
      sample_mean=[]
      for i in range(40):
        sample=np.random.choice(population,samplesize)
        sample_mean.append(sample.mean())
      oneContinesColumn_result[column] = {
                                          "Population Mean": population_mean,
                                          "Sample Means": {np.mean(sample_mean)}
                                          }

  #1-Tailed Test
  if continuous_columns:
    samplesize = int(sampling_rate * len(df[continuous_columns[0]]))
    for column in continuous_columns:
      H0_accepted = 0
      H0_rejected = 0
      for i in range(samplesize):
        sample=df[column].sample(frac=0.05)
        t_test,p_value=stats.ttest_1samp(sample,df[column].mean())
        if p_value > 0.5:
          H0_accepted += 1
        else:
          H0_rejected += 1

      if H0_accepted > H0_rejected:
        oneContinesColumn_result[column].update( {
                                            "H0_accepted": H0_accepted,
                                            "H0_rejected": H0_rejected,
                                            "Conclusion": "H0 is accepted, Ha is rejected, There is no significant effect"
                                          } )
      else:
          oneContinesColumn_result[column].update( {
                                              "H0_accepted": H0_accepted,
                                              "H0_rejected": H0_rejected,
                                              "Conclusion": "H0 is rejected, Ha is accepted, There is a significant effect"
                                              } )

  #2-Tailed Test
  if continuous_columns:
    for i in range(len(continuous_columns) - 1):
      column_1 = continuous_columns[i]
      for column_2 in continuous_columns[i+1:]:
        H0_accepted = 0
        H0_rejected = 0
        for i in range(20):
          sample1 = df[column_1].sample(frac=0.04)
          sample2 = df[column_2].sample(frac=0.04)
          t_test, p_value = stats.ttest_ind(sample1, sample2)
          if p_value > 0.5:
            H0_accepted += 1
          else:
            H0_rejected += 1

        if H0_accepted > H0_rejected:
          comperativeColumn_result[column_1 +' & '+ column_2] = {
                  "H0_accepted": H0_accepted,
                  "H0_rejected": H0_rejected,
                  "Conclusion": "H0 is accepted, Ha is rejected, There is no significant effect"
              }
        else:
            comperativeColumn_result[column_1 +' & '+ column_2] = {
                "H0_accepted": H0_accepted,
                "H0_rejected": H0_rejected,
                "Conclusion": "H0 is rejected, Ha is accepted, There is a significant effect"
            }

  #Chi-Square Test
  if category_columns:
    for i in range(len(category_columns) - 1):
      column_1 = category_columns[i]
      for column_2 in category_columns[i+1:]:
        data = pd.crosstab(df[column_1], df[column_2])
        observed_values = data.values
        chi2_stat, p_value, _, _= stats.chi2_contingency(observed_values)

        if p_value > 0.05:
          comperativeColumn_result[column_1 +" & "+ column_2] = "H0 is accepted, There is no relationship between two columns we're comparing"
        else:
          comperativeColumn_result[column_1 +" & "+ column_2] = "H0 is rejected, There is a relationship between two columns we're comparing"

  #ANOVA
  if continuous_columns and category_columns:
    for category_column in category_columns:
      for continuous_column in continuous_columns:
        group = df[category_column].unique()
        data = {}
        for i in group:
          data[i]=df[continuous_column][df[category_column]==i]

        f_value, p_value = stats.f_oneway(*[data[i] for i in group])
        if p_value > 0.05:
          comperativeColumn_result[category_column +" & "+ continuous_column] = "H0 is accepted, There is a relationship between two columns we're comparing"
        else:
          comperativeColumn_result[category_column +" & "+ continuous_column] = "H0 is rejected, There is no relationship between two columns we're comparing"
  return oneContinesColumn_result, comperativeColumn_result



In [11]:
ContinesColumn_result, comperativeColumn_result = hypothesisTesting(continuous_columns , category_columns)

# **Result Hypothesis**

In [12]:
ContinesColumn_result

{'total_bill': {'Population Mean': 19.78594262295082,
  'Sample Means': {19.948666666666664},
  'H0_accepted': 7,
  'H0_rejected': 5,
  'Conclusion': 'H0 is accepted, Ha is rejected, There is no significant effect'},
 'tip': {'Population Mean': 2.99827868852459,
  'Sample Means': {2.9641666666666664},
  'H0_accepted': 5,
  'H0_rejected': 7,
  'Conclusion': 'H0 is rejected, Ha is accepted, There is a significant effect'}}

In [13]:
ContinesColumn_result_df = pd.DataFrame(ContinesColumn_result)

In [14]:
ContinesColumn_result_df

Unnamed: 0,total_bill,tip
Population Mean,19.785943,2.998279
Sample Means,{19.948666666666664},{2.9641666666666664}
H0_accepted,7,5
H0_rejected,5,7
Conclusion,"H0 is accepted, Ha is rejected, There is no si...","H0 is rejected, Ha is accepted, There is a sig..."


In [15]:
comperativeColumn_result

{'total_bill & tip': {'H0_accepted': 0,
  'H0_rejected': 20,
  'Conclusion': 'H0 is rejected, Ha is accepted, There is a significant effect'},
 'sex & smoker': "H0 is accepted, There is no relationship between two columns we're comparing",
 'sex & day': "H0 is rejected, There is a relationship between two columns we're comparing",
 'sex & time': "H0 is rejected, There is a relationship between two columns we're comparing",
 'sex & size': "H0 is accepted, There is no relationship between two columns we're comparing",
 'smoker & day': "H0 is rejected, There is a relationship between two columns we're comparing",
 'smoker & time': "H0 is accepted, There is no relationship between two columns we're comparing",
 'smoker & size': "H0 is accepted, There is no relationship between two columns we're comparing",
 'day & time': "H0 is rejected, There is a relationship between two columns we're comparing",
 'day & size': "H0 is rejected, There is a relationship between two columns we're comparing"

In [16]:
comperativeColumn_result_df = pd.DataFrame(comperativeColumn_result)

In [17]:
print(comperativeColumn_result_df.columns)

Index(['total_bill & tip', 'sex & smoker', 'sex & day', 'sex & time',
       'sex & size', 'smoker & day', 'smoker & time', 'smoker & size',
       'day & time', 'day & size', 'time & size', 'sex & total_bill',
       'sex & tip', 'smoker & total_bill', 'smoker & tip', 'day & total_bill',
       'day & tip', 'time & total_bill', 'time & tip', 'size & total_bill',
       'size & tip'],
      dtype='object')


In [18]:
comperativeColumn_result_df

Unnamed: 0,total_bill & tip,sex & smoker,sex & day,sex & time,sex & size,smoker & day,smoker & time,smoker & size,day & time,day & size,...,sex & total_bill,sex & tip,smoker & total_bill,smoker & tip,day & total_bill,day & tip,time & total_bill,time & tip,size & total_bill,size & tip
Conclusion,"H0 is rejected, Ha is accepted, There is a sig...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...",...,"H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is accepted, There is a relationship betwee...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is rejected, There is no relationship betwe..."
H0_accepted,0,"H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...",...,"H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is accepted, There is a relationship betwee...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is rejected, There is no relationship betwe..."
H0_rejected,20,"H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...",...,"H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is accepted, There is a relationship betwee...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is accepted, There is a relationship betwee...","H0 is rejected, There is no relationship betwe...","H0 is rejected, There is no relationship betwe..."


# **Data Visualization**

In [19]:
#total_bill,	tip
fig = px.scatter(data_frame=df, x="total_bill", y="tip", color="sex", trendline="ols",
                 title="Scatter Plot")
fig.show()


# **Preprocessing**

In [20]:
df.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [21]:
# Replacing categorical values with numerical equivalents
df['sex'] = df['sex'].map({'Female': 101, 'Male': 102})
df['smoker'] = df['smoker'].map({'No': 201, 'Yes': 202})
df['day'] = df['day'].replace({'Sun': 301, 'Thur': 302, 'Fri': 303, 'Sat' : 304})
df['time'] = df['time'].replace({'Dinner': 401, 'Lunch': 402})
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,101,201,301,401,2
1,10.34,1.66,102,201,301,401,3
2,21.01,3.50,102,201,301,401,3
3,23.68,3.31,102,201,301,401,2
4,24.59,3.61,101,201,301,401,4
...,...,...,...,...,...,...,...
239,29.03,5.92,102,201,304,401,3
240,27.18,2.00,101,202,304,401,2
241,22.67,2.00,102,202,304,401,2
242,17.82,1.75,102,201,304,401,2


# **Machine Learning**

In [22]:
# 1 data availability
# 2 separating independent and dependent
# 3 identifying algorithms/Model
# 4 training
# 5 evaluation