# Importing libraries 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Let pandas show up to 100 rows and 100 columns before truncating output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

## Importing dataset 

In [None]:
# Read the competition data file into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-jun-2022/data.csv'
)

In [None]:
# Show the first ten rows of the loaded data.
df.iloc[:10]

## Dataframe Information

In [None]:
# Print the DataFrame summary produced by DataFrame.info() with default settings.
df.info()

## Checking the number of unique values in each integer-type attribute

In [None]:
# Number of distinct values in every int64 column, smallest count first.
(
    df
    .select_dtypes(include=['int64'])
    .nunique()
    .sort_values()
)

## Checking for missing values in each column 

In [None]:
# Count of missing entries per column.
df.isna().sum()

## Percentage of missing values in each column

In [None]:
# Percentage of missing values per column, rendered with two decimals and a
# percent sign. The values are formatted directly instead of switching pandas'
# global float_format to a percent pattern, so no other float output is shown
# with a stray " %" if cells are run out of order.
((df.isnull().sum() / len(df)) * 100).map('{:,.2f} %'.format)

In [None]:
# Display floats with a thousands separator and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

## Summary Statistics 

In [None]:
# Summary statistics from DataFrame.describe() with its default settings.
df.describe()

## Checking the data distribution of each continuous variable

In [None]:
# Draw one histogram per float64 column of `df`.
# The number of grid rows is derived from the number of columns (five plots per
# row) rather than hard-coding an 11x5 layout, so the cell keeps working if the
# set of float columns changes.
float_cols = df.select_dtypes(include=['float64']).columns
n_grid_cols = 5
# Ceiling division; keep at least one row so the grid spec is always valid.
n_grid_rows = max(1, -(-len(float_cols) // n_grid_cols))

plt.figure(figsize=(18, 18))
for i, col in enumerate(float_cols):
    ax = plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.histplot(data=df, x=col, ax=ax)
plt.suptitle('Data distribution of continuous variables')
plt.tight_layout()
plt.show()

Here we can see that many attributes are positively or negatively skewed, so we will apply a power transformation to make their distributions more symmetrical.

In [None]:
# Keep only the float64 columns of `df` (the original note describes this step
# as separating the columns that contain missing values).
df1 = df.loc[:, df.select_dtypes(include=['float64']).columns]

In [None]:
from sklearn.preprocessing import PowerTransformer

We will use the Yeo-Johnson transform for our data. A power transform makes the probability distribution of a variable more Gaussian.

In [None]:
# Fit a Yeo-Johnson power transform on the float columns and apply it.
# standardize=False leaves the transformed output unscaled.
power = PowerTransformer(
    standardize=False,
    method='yeo-johnson',
)
df2 = power.fit_transform(X=df1)

In [None]:
# Wrap the transformed values in a DataFrame carrying df1's column names.
df2 = pd.DataFrame(data=df2, columns=df1.columns.tolist())

## Heatmap

In [None]:
# Heatmap of the pairwise correlations between the transformed columns,
# drawn without per-cell annotations.
plt.figure(figsize=(18, 18))
sns.heatmap(data=df2.corr(), annot=False)
plt.show()

In [None]:
# Show the first five rows of the transformed data.
df2.iloc[:5]

## Checking the data distribution again after applying the power transformation

In [None]:
# Draw one histogram per float64 column of the transformed frame `df2`.
# The number of grid rows is derived from the number of columns (five plots per
# row) rather than hard-coding an 11x5 layout, so the cell keeps working if the
# set of float columns changes.
transformed_cols = df2.select_dtypes(include=['float64']).columns
n_grid_cols = 5
# Ceiling division; keep at least one row so the grid spec is always valid.
n_grid_rows = max(1, -(-len(transformed_cols) // n_grid_cols))

plt.figure(figsize=(18, 18))
for i, col in enumerate(transformed_cols):
    ax = plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.histplot(data=df2, x=col, ax=ax)
plt.suptitle('density plot')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Iterative imputer that treats NaN as missing and uses a decision-tree
# regressor (fixed seed 0) as the per-feature estimator.
imp = IterativeImputer(
    missing_values=np.nan,
    estimator=DecisionTreeRegressor(random_state=0),
)

In [None]:
# Fit the imputer on the transformed data and fill in its missing entries.
df3 = imp.fit_transform(X=df2)

## Final check for missing values after imputing them

In [None]:
# `df3` holds the imputed data in Yeo-Johnson space (the imputer was fitted on
# the output of `power`). Map it back through the fitted transformer so the
# values are on the same scale as the data read from data.csv; otherwise the
# submission would be written in transformed units.
df3_original_scale = power.inverse_transform(
    pd.DataFrame(df3, columns=df2.columns)
)
imputed_original_scale = pd.DataFrame(
    df3_original_scale, columns=df2.columns, index=df1.index
)
# Keep every observed value exactly as loaded and fill only the gaps, so the
# transform round-trip cannot perturb values that were never missing.
df4 = df1.fillna(imputed_original_scale)

In [None]:
# Show the first five rows of the imputed data.
df4.iloc[:5]

In [None]:
# Count of missing entries per column after imputation.
df4.isna().sum()

## Importing Submission file 

In [None]:
# Read the sample submission file.
sub = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-jun-2022/sample_submission.csv'
)

In [None]:
# Break each 'row-col' entry on "-" into separate columns.
row_col_keys = sub.loc[:, 'row-col']
split = row_col_keys.str.split(pat="-", expand=True)

In [None]:
# First piece of each key is the row label (as int64), second piece is the
# column name (as str).
row = split[0].astype('int64')
col = split[1].astype(str)

In [None]:
# Collect the df4 value for every (row, column) pair requested by the
# submission file. The labels are translated to positions once and the values
# are gathered with a single array lookup, instead of one .loc call per entry
# in a Python loop.
row_pos = df4.index.get_indexer(row.to_numpy())
col_pos = df4.columns.get_indexer(col.to_numpy())
# get_indexer marks unknown labels with -1, which would silently index from the
# end of the array; fail loudly instead, as the per-element .loc lookup would.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('row-col entry refers to a row or column that is not in df4')
val = df4.to_numpy()[row_pos, col_pos].tolist()

In [None]:
# Store the looked-up values in the 'value' column and write the submission
# file without the index.
sub.loc[:, 'value'] = val
sub.to_csv(path_or_buf='final_submission.csv', index=False)