<a href="https://colab.research.google.com/github/Charan4311/Khub_9_/blob/main/data_preprocessing_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the Libraries

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Importing the Dataset

In [6]:
dataset = pd.read_csv("Data1.csv")
print(dataset.head())



   Country   Age    Salary Purchased
0    Italy  29.0  109022.0       Yes
1       UK  53.0   38743.0       Yes
2  Germany  21.0   68368.0       Yes
3       UK  22.0  110025.0       Yes
4       UK  54.0   94172.0        No


In [7]:
print(dataset.tail())

     Country   Age   Salary Purchased
995    Spain  27.0  70100.0        No
996   France   NaN  28179.0        No
997   France  30.0  68781.0        No
998    Italy  48.0      NaN       Yes
999  Germany  24.0      NaN        No


In [8]:
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    1000 non-null   object 
 1   Age        900 non-null    float64
 2   Salary     900 non-null    float64
 3   Purchased  1000 non-null   object 
dtypes: float64(2), object(2)
memory usage: 31.4+ KB
None


# Separating Features (X) and Target (y)

In [9]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

print(" Features (X):", X[:5] )
print(" Target (y):", y[:5] )



 Features (X): [['Italy' 29.0 109022.0]
 ['UK' 53.0 38743.0]
 ['Germany' 21.0 68368.0]
 ['UK' 22.0 110025.0]
 ['UK' 54.0 94172.0]]
 Target (y): ['Yes' 'Yes' 'Yes' 'Yes' 'No']


# Taking care of missing data

In [14]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print(" After Handling Missing Data:", X[:5])

 After Handling Missing Data: [['Italy' 29.0 109022.0]
 ['UK' 53.0 38743.0]
 ['Germany' 21.0 68368.0]
 ['UK' 22.0 110025.0]
 ['UK' 54.0 94172.0]]


# Encoding Categorical Data

In [18]:

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))



le = LabelEncoder()
y = le.fit_transform(y)

print(" Encoded Target (y):\n", y[:5], "\n")



 Encoded Target (y):
 [1 1 1 1 0] 



# Splitting Dataset into Training & Test Sets

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

print( X_train[:5], "\n")
print( X_test[:5], "\n")


[[1.0 0.0 0.0 0.0 0.0 39.02111111111111 112260.0]
 [0.0 0.0 0.0 0.0 1.0 48.0 113812.0]
 [0.0 0.0 1.0 0.0 0.0 59.0 75296.0]
 [0.0 0.0 0.0 0.0 1.0 24.0 61621.0]
 [1.0 0.0 0.0 0.0 0.0 21.0 69131.59555555556]] 

[[0.0 0.0 1.0 0.0 0.0 27.0 66486.0]
 [0.0 0.0 0.0 0.0 1.0 23.0 116468.0]
 [0.0 0.0 0.0 0.0 1.0 51.0 109489.0]
 [0.0 0.0 1.0 0.0 0.0 41.0 48602.0]
 [1.0 0.0 0.0 0.0 0.0 19.0 75099.0]] 



# Feature Scaling

In [20]:
sc = StandardScaler()
X_train[:, 2:] = sc.fit_transform(X_train[:, 2:])
X_test[:, 2:] = sc.transform(X_test[:, 2:])

print( X_train[:5], "\n")
print( X_test[:5], "\n")



[[1.0 0.0 -0.5272146351966084 -0.4665361852810153 -0.513637094426537
  -0.010651336552110346 1.5531361821338794]
 [0.0 0.0 -0.5272146351966084 -0.4665361852810153 1.9468998848622632
  0.7637386830400456 1.60955476743711]
 [0.0 0.0 1.8967606990406713 -0.4665361852810153 -0.513637094426537
  1.71244072411941 0.2094141543319347]
 [0.0 0.0 -0.5272146351966084 -0.4665361852810153 1.9468998848622632
  -1.3061566793149313 -0.2877019242902813]
 [1.0 0.0 -0.5272146351966084 -0.4665361852810153 -0.513637094426537
  -1.5648935996093032 -0.01467539334364717]] 

[[0.0 0.0 1.8967606990406713 -0.4665361852810153 -0.513637094426537
  -1.0474197590205592 -0.11084856249890426]
 [0.0 0.0 -0.5272146351966084 -0.4665361852810153 1.9468998848622632
  -1.3924023194130553 1.7061061608426387]
 [0.0 0.0 -0.5272146351966084 -0.4665361852810153 1.9468998848622632
  1.0224756033344176 1.452404287884361]
 [0.0 0.0 1.8967606990406713 -0.4665361852810153 -0.513637094426537
  0.16001920235317735 -0.7609709720111317]
 