In [105]:
# Box - Cox 
# The Box-Cox transformation is appropriate for strictly positive data: it cannot handle zero or negative values. It applies a power transformation whose exponent (lambda) is estimated from the data so that the transformed values are as close to normally distributed as possible. Because of the positivity requirement, Box-Cox is more restrictive than the Yeo-Johnson transformation.


In [106]:
import numpy as np
from sklearn.preprocessing import PowerTransformer

In [107]:
# Draw a skewed, single-column sample from a gamma distribution (shape=1, scale=2).
# No seed is set in this cell, so the printed values change between runs
# unless the global NumPy seed was fixed earlier in the notebook.
data = np.random.gamma(shape=1, scale=2, size=(100, 1))

# Power transformer configured for the Box-Cox method.
pt = PowerTransformer(method='box-cox')

# Fit on the sample and transform it in a single call.
transformed_data = pt.fit_transform(data)

# Show the first five rows before and after the transformation for comparison.
print('Original data:\n', data[:5])
print('Transformed data:\n', transformed_data[:5])

Original data:
 [[1.84907224]
 [2.76599166]
 [0.02909284]
 [0.35911817]
 [0.31388209]]
Transformed data:
 [[ 0.20220304]
 [ 0.59119689]
 [-1.91228426]
 [-0.96210201]
 [-1.03393726]]


In [108]:
# Yeo-Johnson: a more flexible power transformation that can be applied to positive, negative, and zero values. Like Box-Cox, it estimates an exponent that makes the transformed data as close to normally distributed as possible, but it uses a slightly different formula, so it can handle a wider range of data and distributions than the Box-Cox transformation.


In [109]:
# Two-column skewed sample from a gamma distribution (shape=1, scale=2); unseeded,
# so the printed values differ from run to run.
data = np.random.gamma(shape=1, scale=2, size=(100, 2))

# Fit a Yeo-Johnson power transformer on the sample and transform it in one step.
pt = PowerTransformer(method='yeo-johnson')
transformed_data = pt.fit_transform(data)

# First five rows before and after the transformation.
print('original data:\n', data[:5])
print('transformed data:\n', transformed_data[:5])

original data:
 [[3.45850041 1.66580785]
 [3.40777771 1.83256534]
 [2.21335906 0.73246717]
 [0.4049987  1.03800177]
 [0.10363918 0.47805668]]
transformed data:
 [[ 1.02670345  0.30015448]
 [ 1.00994788  0.40263462]
 [ 0.51560789 -0.50018757]
 [-1.11535915 -0.18300039]
 [-1.70173668 -0.82993428]]


In [110]:
# apply on data 

In [111]:
from sklearn.model_selection import train_test_split

In [112]:
from sklearn.linear_model import LinearRegression 

In [113]:
# Synthetic regression data: 1000 samples x 5 features plus an independent target,
# every value drawn from a normal distribution with mean 100 and std 10.
# A locally seeded generator makes the draw reproducible after a kernel restart
# without touching NumPy's global random state.
rng = np.random.default_rng(42)
x = rng.normal(loc=100, scale=10, size=(1000, 5))
y = rng.normal(loc=100, scale=10, size=1000)

In [114]:
# Hold out 10% of the rows as a test set; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)

In [115]:
# Box-Cox transform of the features (the method requires strictly positive inputs).
boxcox_transformer = PowerTransformer(method='box-cox', standardize=True)

# Learn the per-feature lambdas and the standardization statistics from the training set only.
x_train_bc = boxcox_transformer.fit_transform(x_train)

# Apply the parameters fitted on the training set to the test set. Calling
# fit_transform here would refit on the test data, leaking test information and
# putting train and test features on different scales.
x_test_bc = boxcox_transformer.transform(x_test)

In [116]:
# Yeo-Johnson transform of the features.
yeojohnson_transformer = PowerTransformer(method='yeo-johnson', standardize=True)

# Learn the per-feature lambdas and the standardization statistics from the training set only.
x_train_yj = yeojohnson_transformer.fit_transform(x_train)

# Reuse the parameters fitted on the training set. Refitting on the test set
# (fit_transform) would leak test information and make the two sets inconsistent.
x_test_yj = yeojohnson_transformer.transform(x_test)

In [117]:
# Pipeline function


In [118]:
import pandas as pd

In [119]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.datasets import fetch_california_housing
from sklearn.compose import ColumnTransformer

In [120]:
# Load the California housing dataset with pandas containers.
housing = fetch_california_housing(as_frame=True)

# Split the feature names into two groups: the first two are scaled,
# the remaining ones are sent through the one-hot encoder.
# NOTE(review): the remaining housing columns appear to be continuous numeric
# measurements, so one-hot encoding them treats every distinct value as a
# category — confirm this is intended and not just a ColumnTransformer demo.
numeric_features, categorical_features = (
    housing.feature_names[:2],
    housing.feature_names[2:],
)

# One preprocessing pipeline per feature group.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

In [121]:
# Route each feature group to its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [122]:
# Full model: preprocessing followed by an ordinary least-squares regressor.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

In [123]:
# Feature matrix (columns named after the dataset's feature names) and target series.
x = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
)
y = pd.Series(housing.target)

In [124]:
# Fit the preprocessing + regression pipeline on the full housing data (no hold-out split here).
pipeline.fit(x,y)

In [125]:
# Predict for the first ten rows; these rows were also part of the data used for fitting.
x_new = x.head(10)
y_pred = pipeline.predict(x_new)
print(y_pred)

[4.5263782  3.58492154 3.52075949 3.41311554 3.42219836 2.69722465
 2.99204185 2.41406741 2.26710339 2.6113283 ]


In [126]:
# Load the toy COVID dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is replaced with a path relative to the project.
df = pd.read_csv(
    'C:\\Users\\gajendra singh\\OneDrive\\Desktop\\pandas\\covid_toy.csv'
)
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [127]:
#from sklearn.preprocessing import LabelEncoder
#lb = LabelEncoder()
#df['gender']=lb.fit_transform(df['gender'])
#df['cough']=lb.fit_transform(df['cough'])
#df['city']=lb.fit_transform(df['city'])


In [128]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression 
# Features are every column except the label; the label is the 'has_covid' column.
x = df.drop(columns='has_covid')
y = df['has_covid']

In [129]:
# Hold out 10% of the rows as a test set with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)

In [130]:
# Column groups fed to the preprocessing pipelines. Columns not listed here
# (e.g. 'cough') are not passed to either pipeline.
numerical_features = ['age', 'fever']
categorical_features = ['gender', 'city']

In [131]:
# create transformer
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])


In [132]:
# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [133]:
# Column transformer: send each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [134]:
# Classification pipeline: preprocessing followed by logistic regression.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

In [135]:
# Train the pipeline on the training split.
clf.fit(x_train, y_train)

# Predict labels for the held-out rows; scoring happens in a later cell.
y_pred = clf.predict(x_test)

In [136]:
# Display the predicted labels for the test rows.
y_pred

array(['No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No'],
      dtype=object)

In [137]:
from sklearn.metrics import accuracy_score

In [138]:
# Fraction of test rows whose predicted label matches the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
acc

0.8

In [139]:
# Load the Social Network Ads dataset.
# NOTE(review): absolute, machine-specific path — replace with a project-relative
# path before sharing the notebook.
df1 = pd.read_csv(
    'C:\\Users\\gajendra singh\\OneDrive\\Desktop\\pandas\\Social_Network_Ads.csv'
)

In [140]:
# Display the whole frame (the rendered output is truncated by pandas).
df1

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [141]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [142]:
# Preview the first five rows.
df1.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [143]:
# Remove the identifier and gender columns so only numeric predictors and the label remain.
# errors='ignore' keeps the cell idempotent: because df1 is reassigned in place,
# re-running the cell would otherwise raise KeyError for the already-dropped columns.
df1 = df1.drop(columns=['User ID', 'Gender'], errors='ignore')

In [144]:
# Features are every remaining column except the label; the label is 'Purchased'.
x = df1.drop(columns=['Purchased'])
y = df1['Purchased']

In [145]:
# Hold out 20% of the rows as a test set with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [147]:
# Standardize, project onto two principal components, then classify with a
# seeded 100-tree random forest.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('classifiers', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

In [148]:
# Display the pipeline's representation.
pipe

In [149]:
# Fit every step of the pipeline on the training split.
pipe.fit(x_train,y_train)

In [150]:
# Predict labels for the held-out rows.
y_pred=pipe.predict(x_test)

In [151]:
# Fraction of test rows whose predicted label matches the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

0.8875


In [152]:
# Load the placement dataset; the path is relative to the notebook's working directory.
df2 = pd.read_csv('placement.csv')

In [153]:
# Display the whole frame (the rendered output is truncated by pandas).
df2

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1
1,6.17,5.17,0
2,8.27,8.86,1
3,6.88,7.27,1
4,7.52,7.30,1
...,...,...,...
95,6.33,6.38,0
96,8.23,7.76,1
97,6.65,7.78,0
98,8.14,5.63,1


In [154]:
# Preview the first five rows.
df2.head()

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1
1,6.17,5.17,0
2,8.27,8.86,1
3,6.88,7.27,1
4,7.52,7.3,1


In [155]:
# Features are every column except the label; the label is the 'placed' column.
# (A stray trailing character after the drop() call made this cell a SyntaxError.)
x = df2.drop(columns=['placed'])
y = df2['placed']

In [156]:
# Hold out 20% of the rows as a test set with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [163]:
# Box-Cox transform of the placement features (the method requires strictly positive inputs).
boxcox_transformer = PowerTransformer(method='box-cox', standardize=True)

# Learn the per-feature lambdas and the standardization statistics from the training set only.
x_train_bc = boxcox_transformer.fit_transform(x_train)

# Apply the parameters fitted on the training set; refitting on the test set
# (fit_transform) would leak test information and make the two sets inconsistent.
x_test_bc = boxcox_transformer.transform(x_test)

In [164]:
# Yeo-Johnson power transformer with standardized output; fitted in the next cell.
yeojohnson_transformer=PowerTransformer(method='yeo-johnson', standardize=True)

In [165]:
# Learn the Yeo-Johnson parameters from the training set only.
x_train_yj = yeojohnson_transformer.fit_transform(x_train)

# Reuse the parameters fitted on the training set for the test set; calling
# fit_transform here would refit on test data and leak information.
x_test_yj = yeojohnson_transformer.transform(x_test)

In [167]:
# Numeric columns passed to the preprocessing pipeline.
numerical_feature = [
    'cgpa',
    'resume_score',
]

In [168]:
# Numeric preprocessing: fill missing values with the column mean, then standardize.
numerical_transform = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [170]:
# Apply the numeric preprocessing pipeline to the listed numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transform, numerical_feature),
    ]
)

In [171]:
# Classification pipeline: preprocessing followed by logistic regression.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

In [172]:
# Train on the training split, then predict labels for the held-out rows.
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [173]:
# Display the predicted labels for the test rows.
y_pred

array([1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0],
      dtype=int64)