<a href="https://colab.research.google.com/github/GDharan10/Data-Preprocessing-Framework/blob/main/Data_Preprocessing_Framework_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Librarys**

In [1]:
from pprint import pprint
import pandas as pd
import numpy as np

#Statistic
from scipy import stats

#Machine Learning
import statsmodels.api as sm

#Visualization
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler


# **DataFrame**

In [2]:
df = sns.load_dataset('tips')
df.to_csv('tips.csv', index=False)

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# **DataFrame observation**


Supervised [Classification / Regression] (or) Unsupervised [Clustering / Association]

Dependent column

Independent column

Continuous column

Category column

In [4]:
# Dependent -
# Independent -

continuous_columns = ["total_bill", "tip"]
category_columns = ["sex",	"smoker",	"day", "time", "size"]

#Unsupervised

# **Data cleaning using pandas**


Handling Null Values

Handling Duplicates

Data Type Conversion

Standardizing or Normalizing Data

Handling Text Data

Handling Date and Time Data

Handling Outliers

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [6]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [7]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

# **Hypothesis testing / Statistical analysis**


Continuous - Central Limit Theorem, 1-Tailed Test (one-sample t-test)

Continuous vs. Continuous - Correlation, 2-Tailed Test (two-sample t-test)

Continuous vs. Categorical - ANOVA (Analysis of Variance)

Categorical vs. Categorical - Chi-Square Test

In [8]:
'''
             Correlation
for all numerical columns

           Central Limit Theorem
total_bill
tip

             1-Tailed Test
total_bill
tip

              2-Tailed Test
total_bill,	tip 	   - H0 is rejected

                ANOVA
total_bill, sex	    - H0 is rejected
total_bill, smoker   - H0 is accepted
total_bill, day     - H0 is rejected
total_bill, time	    - H0 is rejected
total_bill, size    - H0 is rejected

tip, sex,	          - H0 is accepted
tip, smoker          - H0 is accepted
tip, day            - H0 is accepted
tip, time	          - H0 is accepted
tip, size            - H0 is rejected

            Chi-Square Test
sex, smoker          - H0 is accepted
sex, day              - H0 is rejected
sex, time	          - H0 is rejected
sex, size           - H0 is accepted

day, time	        - H0 is rejected
day, size	         - H0 is rejected

time, size	         - H0 is rejected
'''

'\n             Correlation\nfor all numerical columns\n\n           Central Limit Theorem\ntotal_bill\ntip\n\n             1-Tailed Test\ntotal_bill\ntip\n\n              2-Tailed Test\ntotal_bill,\ttip \t   - H0 is rejected\n\n                ANOVA\ntotal_bill, sex\t    - H0 is rejected\ntotal_bill, smoker   - H0 is accepted\ntotal_bill, day     - H0 is rejected\ntotal_bill, time\t    - H0 is rejected\ntotal_bill, size    - H0 is rejected\n\ntip, sex,\t          - H0 is accepted\ntip, smoker          - H0 is accepted\ntip, day            - H0 is accepted\ntip, time\t          - H0 is accepted\ntip, size            - H0 is rejected\n\n            Chi-Square Test\nsex, smoker          - H0 is accepted\nsex, day              - H0 is rejected\nsex, time\t          - H0 is rejected\nsex, size           - H0 is accepted\n\nday, time\t        - H0 is rejected\nday, size\t         - H0 is rejected\n\ntime, size\t         - H0 is rejected\n'

# Correlation

In [9]:
df.corr()

  df.corr()


Unnamed: 0,total_bill,tip,size
total_bill,1.0,0.675734,0.598315
tip,0.675734,1.0,0.489299
size,0.598315,0.489299,1.0


In [18]:
def hypothesisTesting(continuous_columns = [], category_columns = [], sampling_rate = 0.05):
  #Central Limit Theorem
  oneContinesColumn_result = {}
  comperativeColumn_result = {}
  if continuous_columns:
    samplesize = int(sampling_rate * len(df[continuous_columns[0]]))
    for column in continuous_columns:
      population=df[column].values
      population_mean = population.mean()
      sample_mean=[]
      for i in range(40):
        sample=np.random.choice(population,samplesize)
        sample_mean.append(sample.mean())
      oneContinesColumn_result[column] = {
                                          "Population Mean": population_mean,
                                          "Sample Means": {np.mean(sample_mean)}
                                          }

  #1-Tailed Test
  if continuous_columns:
    samplesize = int(sampling_rate * len(df[continuous_columns[0]]))
    for column in continuous_columns:
      H0_accepted = 0
      H0_rejected = 0
      for i in range(samplesize):
        sample=df[column].sample(frac=0.05)
        t_test,p_value=stats.ttest_1samp(sample,df[column].mean())
        if p_value > 0.5:
          H0_accepted += 1
        else:
          H0_rejected += 1

      if H0_accepted > H0_rejected:
        oneContinesColumn_result[column].update( {
                                            "H0_accepted": H0_accepted,
                                            "H0_rejected": H0_rejected,
                                            "Conclusion": "H0 is accepted, Ha is rejected, There is no significant effect"
                                          } )
      else:
          oneContinesColumn_result[column].update( {
                                              "H0_accepted": H0_accepted,
                                              "H0_rejected": H0_rejected,
                                              "Conclusion": "H0 is rejected, Ha is accepted, There is a significant effect"
                                              } )

  #2-Tailed Test
  if continuous_columns:
    for i in range(len(continuous_columns) - 1):
      column_1 = continuous_columns[i]
      for column_2 in continuous_columns[i+1:]:
        H0_accepted = 0
        H0_rejected = 0
        for i in range(20):
          sample1 = df[column_1].sample(frac=0.04)
          sample2 = df[column_2].sample(frac=0.04)
          t_test, p_value = stats.ttest_ind(sample1, sample2)
          if p_value > 0.5:
            H0_accepted += 1
          else:
            H0_rejected += 1

        if H0_accepted > H0_rejected:
          comperativeColumn_result[column_1 +' & '+ column_2] = {
                  "H0_accepted": H0_accepted,
                  "H0_rejected": H0_rejected,
                  "Conclusion": "H0 is accepted, Ha is rejected, There is no significant effect"
              }
        else:
            comperativeColumn_result[column_1 +' & '+ column_2] = {
                "H0_accepted": H0_accepted,
                "H0_rejected": H0_rejected,
                "Conclusion": "H0 is rejected, Ha is accepted, There is a significant effect"
            }

  #Chi-Square Test
  if category_columns:
    for i in range(len(category_columns) - 1):
      column_1 = category_columns[i]
      for column_2 in category_columns[i+1:]:
        data = pd.crosstab(df[column_1], df[column_2])
        observed_values = data.values
        chi2_stat, p_value, _, _= stats.chi2_contingency(observed_values)

        if p_value > 0.05:
          comperativeColumn_result[column_1 +" & "+ column_2] = "H0 is accepted, There is no relationship between two columns we're comparing"
        else:
          comperativeColumn_result[column_1 +" & "+ column_2] = "H0 is rejected, There is a relationship between two columns we're comparing"

  #ANOVA
  if continuous_columns and category_columns:
    for category_column in category_columns:
      for continuous_column in continuous_columns:
        group = df[category_column].unique()
        data = {}
        for i in group:
          data[i]=df[continuous_column][df[category_column]==i]

        f_value, p_value = stats.f_oneway(*[data[i] for i in group])
        if p_value > 0.05:
          comperativeColumn_result[category_column +" & "+ continuous_column] = "H0 is accepted, There is no relationship between two columns we're comparing"
        else:
          comperativeColumn_result[category_column +" & "+ continuous_column] = "H0 is rejected, There is a relationship between two columns we're comparing"
  return oneContinesColumn_result, comperativeColumn_result



In [21]:
ContinesColumn_result, comperativeColumn_result = hypothesisTesting(continuous_columns , category_columns)

In [22]:
ContinesColumn_result

{'total_bill': {'Population Mean': 19.78594262295082,
  'Sample Means': {19.647812499999997},
  'H0_accepted': 3,
  'H0_rejected': 9,
  'Conclusion': 'H0 is rejected, Ha is accepted, There is a significant effect'},
 'tip': {'Population Mean': 2.99827868852459,
  'Sample Means': {2.9922291666666667},
  'H0_accepted': 7,
  'H0_rejected': 5,
  'Conclusion': 'H0 is accepted, Ha is rejected, There is no significant effect'}}

In [24]:
ContinesColumn_result_df = pd.DataFrame(ContinesColumn_result)

In [25]:
ContinesColumn_result_df

Unnamed: 0,total_bill,tip
Population Mean,19.785943,2.998279
Sample Means,{19.647812499999997},{2.9922291666666667}
H0_accepted,3,7
H0_rejected,9,5
Conclusion,"H0 is rejected, Ha is accepted, There is a sig...","H0 is accepted, Ha is rejected, There is no si..."


In [23]:
comperativeColumn_result

{'total_bill & tip': {'H0_accepted': 0,
  'H0_rejected': 20,
  'Conclusion': 'H0 is rejected, Ha is accepted, There is a significant effect'},
 'sex & smoker': "H0 is accepted, There is no relationship between two columns we're comparing",
 'sex & day': "H0 is rejected, There is a relationship between two columns we're comparing",
 'sex & time': "H0 is rejected, There is a relationship between two columns we're comparing",
 'sex & size': "H0 is accepted, There is no relationship between two columns we're comparing",
 'smoker & day': "H0 is rejected, There is a relationship between two columns we're comparing",
 'smoker & time': "H0 is accepted, There is no relationship between two columns we're comparing",
 'smoker & size': "H0 is accepted, There is no relationship between two columns we're comparing",
 'day & time': "H0 is rejected, There is a relationship between two columns we're comparing",
 'day & size': "H0 is rejected, There is a relationship between two columns we're comparing"

In [26]:
comperativeColumn_result_df = pd.DataFrame(comperativeColumn_result)

In [30]:
print(comperativeColumn_result_df.columns)

Index(['total_bill & tip', 'sex & smoker', 'sex & day', 'sex & time',
       'sex & size', 'smoker & day', 'smoker & time', 'smoker & size',
       'day & time', 'day & size', 'time & size', 'sex & total_bill',
       'sex & tip', 'smoker & total_bill', 'smoker & tip', 'day & total_bill',
       'day & tip', 'time & total_bill', 'time & tip', 'size & total_bill',
       'size & tip'],
      dtype='object')


In [32]:
comperativeColumn_result_df

Unnamed: 0,total_bill & tip,sex & smoker,sex & day,sex & time,sex & size,smoker & day,smoker & time,smoker & size,day & time,day & size,...,sex & total_bill,sex & tip,smoker & total_bill,smoker & tip,day & total_bill,day & tip,time & total_bill,time & tip,size & total_bill,size & tip
Conclusion,"H0 is rejected, Ha is accepted, There is a sig...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...",...,"H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee..."
H0_accepted,0,"H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...",...,"H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee..."
H0_rejected,20,"H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee...",...,"H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is accepted, There is no relationship betwe...","H0 is rejected, There is a relationship betwee...","H0 is rejected, There is a relationship betwee..."


# **Result Hypothesis**

In [13]:
def hypothesis_test(continuous_columns, category_columns, sampling_rate = 0.05):
  centralLimitTheorem_result = centralLimitTheorem(continuous_columns, sampling_rate)
  onetailtest_result = oneTailTest(continuous_columns, sampling_rate)
  twotailtest_result = twoTailTest(continuous_columns, sampling_rate)
  Chi_Square_Test_result = chiSquareTest(category_columns)
  anova_result = anova(category_columns, continuous_columns)

In [14]:
centralLimitTheorem_result = centralLimitTheorem(continuous_columns, 0.25)
print("Central Limit Theorem result :\n")
for column, value in centralLimitTheorem_result.items():
  print(f"{column} : {value} \n")

sampling_rate = 0.05
onetailtest_result = oneTailTest(continuous_columns, sampling_rate)
print("\n1 sample t_test :\n")
for column, value in onetailtest_result.items():
    print(f"{column}: H0 accepted {value['H0_accepted']} times, H0 rejected {value['H0_rejected']} times")
    print(value['Conclusion'],'\n')

twotailtest_result = twoTailTest(continuous_columns)
print("\n2 sample t_test :\n")
for column, value in twotailtest_result.items():
    print(f"{column}: H0 accepted {value['H0_accepted']} times, H0 rejected {value['H0_rejected']} times")
    print(value['Conclusion'],'\n')

Chi_Square_Test_result = chiSquareTest(category_columns)
print("\nChi Square Test result :\n")
for column, value in Chi_Square_Test_result.items():
  print(f"{column} : {value} \n")

anova_result = anova(category_columns, continuous_columns)
print("\nANOVA Test :\n")
for column, value in Chi_Square_Test_result.items():
  print(f"{column} : {value} \n")

NameError: name 'centralLimitTheorem' is not defined

# **Data Visualization**



Continuous - Histogram, Boxplot, Violin Plot

Continuous vs Continuous - [Scatter / 3D scatter plot]

Categorical vs Categorical - Stacked Bar Chart, [Heatmap / Correlation Matrix]

Continuous vs Categorical - Bar Plot, [pie chart / donut chart], Clustered Bar Chart, Line

One Continuous vs More Categorical - sunburst plot, [stacked bar / grouped bar]


More: count plot, mosaic bar

In [None]:
correlation_matrix = df.corr()
fig = px.imshow(correlation_matrix, color_continuous_scale='Viridis',
                title="Correlation Matrix")
fig.show()

In [None]:
#total_bill,	tip
fig = px.scatter(data_frame=df, x="total_bill", y="tip", color="sex", trendline="ols",
                 title="Scatter Plot")
fig.show()


In [None]:
#total_bill,	tip
fig = px.line(data_frame=df, x="total_bill", y="tip", color="sex", title="Line Plot")
fig.show()

In [None]:
#total_bill, day
df = px.data.tips()
fig = px.bar(data_frame=df, x="day", y="total_bill", color="sex", barmode="group",
             title="Clustered Bar Chart")
fig.show()


In [None]:
#sex, smoker
fig = px.bar(data_frame=df, x="sex", y="smoker", color="sex", barmode="stack",
             title="Stacked Bar Chart")
fig.show()


In [None]:
#tip, day
fig = px.bar(df, x = 'day',
                 y = 'tip',

                 title = 'Day vs Tip'
                )
fig.show()

In [None]:
pivot_table = df.pivot_table(values='tip', index='sex', columns='day', aggfunc='count')

print(pivot_table)

# **Preprocessing**

In [None]:
df.time.unique()

In [None]:
# Replacing categorical values with numerical equivalents
df['sex'] = df['sex'].map({'Female': 101, 'Male': 102})
df['smoker'] = df['smoker'].map({'No': 201, 'Yes': 202})
df['day'] = df['day'].replace({'Sun': 301, 'Thur': 302, 'Fri': 303, 'Sat' : 304})
df['time'] = df['time'].replace({'Dinner': 401, 'Lunch': 402})
df

# **Machine Learning**

# Supervised Regression - Linear regression

Step 1: Data Availability:
Ensure you have your independent variable(s) x1 and dependent variable y

Step 2: Separating Independent and Dependent Variables:

x1 = Independent_variable_data

y = Dependent_variable_data

Step 3: Creating and Fitting the Model
Add a constant term to the independent variable:

x = sm.add_constant(x1)

Step 4: Create an Ordinary Least Squares (OLS) model:

model = sm.OLS(y, x)

Step 5: Traing the model:

result = model.fit()

Step 6: Evaluation:
Print the summary of the regression results

print(result.summary())

Step 7: Predictions:
Predict y values based on the fitted model

yhat = result.predict(x)


In [None]:
'''
x1 = df.independent_column
y = df.dependent_column
x = sm.add_constant(x1)
model = sm.OLS(y,x)
result = model.fit()
result.summary()
yhat = result.params[0]+result.params[1]* x1
'''

# Supervised Classification - Logistic regression

x1 = Independent

y = Dependent

x=sm.add_constant(x1)

result_log = sm.Logit(y,x).fit()

result_log.pred_table()

confusion_matrix_df=pd.DataFrame(result_log.pred_table())

confusion_matrix=np.array(confusion_matrix_df)

acc=(confusion_matrix[0,0]+confusion_matrix[1,1])/confusion_matrix.sum()

result_log.summary()