# Missing Data Application

##### This Python code performs data preprocessing and classification on a dataset that contains missing values. First, the data is read from a CSV file using the pandas library, and the missing entries are filled in with SimpleImputer. After imputation, the categorical data is converted into numerical data with LabelEncoder and OneHotEncoder. The resulting arrays are then placed in pandas DataFrames and merged. The data is split into training and test sets (train_test_split), and the features are scaled using StandardScaler. A Logistic Regression model (LogisticRegression) is created, trained on the training set, and used to predict labels for the test set. Finally, the predicted and actual values are printed to the screen.

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk

# Load the raw dataset into a DataFrame and display it; the printed frame
# shows NaN entries in the Age column that are imputed in a later cell.
csv_path = "C:\\Users\\Arif Furkan\\OneDrive\\Belgeler\\Python_kullanirken\\eksikveriler.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)
print(data)

   Country  Size  Weight   Age sex
0       tr   130      30  10.0   m
1       tr   125      36  11.0   m
2       tr   135      34  10.0   w
3       tr   133      30   9.0   m
4       tr   129      38  12.0   m
5       tr   180      90  30.0   m
6       tr   190      80  25.0   m
7       tr   175      90  35.0   m
8       tr   177      60  22.0   w
9       us   185     105  33.0   m
10      us   165      55  27.0   w
11      us   155      50  44.0   w
12      us   160      58   NaN   w
13      us   162      59  41.0   w
14      us   167      62  55.0   w
15      fr   174      70  47.0   m
16      fr   193      90   NaN   m
17      fr   187      80  27.0   m
18      fr   183      88  28.0   m
19      fr   159      40  29.0   w
20      fr   164      66  32.0   w
21      fr   166      56  42.0   w


## Filling Missing Data

In [14]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of the column it appears in.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# NOTE: despite its name, `Age` holds all three numeric columns
# (Size, Weight, Age). The name is kept because later cells refer to it.
Age = data.iloc[:, 1:4].values
print(Age)

# Impute the whole array. The earlier version sliced it with [:, 1:4], an
# index range that belongs to `data` rather than to this 3-column array, so
# the first column (Size) was silently excluded from imputation.
Age = imputer.fit_transform(Age)
print(Age)

# Keep the country column as a 2-D (n_rows, 1) array for the encoding step.
Country = data.iloc[:, 0:1].values
print(Country)

[[130.  30.  10.]
 [125.  36.  11.]
 [135.  34.  10.]
 [133.  30.   9.]
 [129.  38.  12.]
 [180.  90.  30.]
 [190.  80.  25.]
 [175.  90.  35.]
 [177.  60.  22.]
 [185. 105.  33.]
 [165.  55.  27.]
 [155.  50.  44.]
 [160.  58.  nan]
 [162.  59.  41.]
 [167.  62.  55.]
 [174.  70.  47.]
 [193.  90.  nan]
 [187.  80.  27.]
 [183.  88.  28.]
 [159.  40.  29.]
 [164.  66.  32.]
 [166.  56.  42.]]
[[130.    30.    10.  ]
 [125.    36.    11.  ]
 [135.    34.    10.  ]
 [133.    30.     9.  ]
 [129.    38.    12.  ]
 [180.    90.    30.  ]
 [190.    80.    25.  ]
 [175.    90.    35.  ]
 [177.    60.    22.  ]
 [185.   105.    33.  ]
 [165.    55.    27.  ]
 [155.    50.    44.  ]
 [160.    58.    28.45]
 [162.    59.    41.  ]
 [167.    62.    55.  ]
 [174.    70.    47.  ]
 [193.    90.    28.45]
 [187.    80.    27.  ]
 [183.    88.    28.  ]
 [159.    40.    29.  ]
 [164.    66.    32.  ]
 [166.    56.    42.  ]]
[['tr']
 ['tr']
 ['tr']
 ['tr']
 ['tr']
 ['tr']
 ['tr']
 ['tr']
 ['tr']
 [

## Encoding Categorical Data

In [15]:
from sklearn import preprocessing

# Integer-encode the country labels. The codes go into a fresh column vector
# rather than being written into `Country` in place: `Country` was obtained
# from `DataFrame.values`, which may share memory with `data` (or be
# read-only when pandas Copy-on-Write is active), so an in-place write could
# either overwrite the source frame or raise.
le = preprocessing.LabelEncoder()
Country = le.fit_transform(data.iloc[:, 0]).reshape(-1, 1)
print(Country)

# Expand the integer codes into one indicator column per country.
ohe = preprocessing.OneHotEncoder()
Country = ohe.fit_transform(Country).toarray()
print(Country)

[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


## Creating DataFrames

In [16]:
# Derive the row index from the loaded data instead of hard-coding 22 rows,
# so this cell keeps working when the CSV grows or shrinks.
row_index = range(len(data))
print(list(row_index))

# One-hot country indicators. The column names follow the encoder's code
# order seen in the printed output (fr=0, tr=1, us=2).
result = pd.DataFrame(data=Country, index=row_index, columns=['fr', 'tr', 'us'])
print(result)

# Imputed numeric features.
result2 = pd.DataFrame(data=Age, index=row_index, columns=['Size', 'Weight', 'Age'])
print(result2)

# The last column of the raw data is the classification target.
sex = data.iloc[:, -1].values
print(sex)
result3 = pd.DataFrame(data=sex, index=row_index, columns=['sex'])
print(result3)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
     fr   tr   us
0   0.0  1.0  0.0
1   0.0  1.0  0.0
2   0.0  1.0  0.0
3   0.0  1.0  0.0
4   0.0  1.0  0.0
5   0.0  1.0  0.0
6   0.0  1.0  0.0
7   0.0  1.0  0.0
8   0.0  1.0  0.0
9   0.0  0.0  1.0
10  0.0  0.0  1.0
11  0.0  0.0  1.0
12  0.0  0.0  1.0
13  0.0  0.0  1.0
14  0.0  0.0  1.0
15  1.0  0.0  0.0
16  1.0  0.0  0.0
17  1.0  0.0  0.0
18  1.0  0.0  0.0
19  1.0  0.0  0.0
20  1.0  0.0  0.0
21  1.0  0.0  0.0
     Size  Weight    Age
0   130.0    30.0  10.00
1   125.0    36.0  11.00
2   135.0    34.0  10.00
3   133.0    30.0   9.00
4   129.0    38.0  12.00
5   180.0    90.0  30.00
6   190.0    80.0  25.00
7   175.0    90.0  35.00
8   177.0    60.0  22.00
9   185.0   105.0  33.00
10  165.0    55.0  27.00
11  155.0    50.0  44.00
12  160.0    58.0  28.45
13  162.0    59.0  41.00
14  167.0    62.0  55.00
15  174.0    70.0  47.00
16  193.0    90.0  28.45
17  187.0    80.0  27.00
18  183.0    88.0  28.00
19  159

## Merging All Data

In [17]:
# Feature matrix: country indicators side by side with the numeric columns.
r = pd.concat((result, result2), axis="columns")
print(r)

# Full table: the features plus the `sex` target column.
r2 = pd.concat((r, result3), axis="columns")
print(r2)

     fr   tr   us   Size  Weight    Age
0   0.0  1.0  0.0  130.0    30.0  10.00
1   0.0  1.0  0.0  125.0    36.0  11.00
2   0.0  1.0  0.0  135.0    34.0  10.00
3   0.0  1.0  0.0  133.0    30.0   9.00
4   0.0  1.0  0.0  129.0    38.0  12.00
5   0.0  1.0  0.0  180.0    90.0  30.00
6   0.0  1.0  0.0  190.0    80.0  25.00
7   0.0  1.0  0.0  175.0    90.0  35.00
8   0.0  1.0  0.0  177.0    60.0  22.00
9   0.0  0.0  1.0  185.0   105.0  33.00
10  0.0  0.0  1.0  165.0    55.0  27.00
11  0.0  0.0  1.0  155.0    50.0  44.00
12  0.0  0.0  1.0  160.0    58.0  28.45
13  0.0  0.0  1.0  162.0    59.0  41.00
14  0.0  0.0  1.0  167.0    62.0  55.00
15  1.0  0.0  0.0  174.0    70.0  47.00
16  1.0  0.0  0.0  193.0    90.0  28.45
17  1.0  0.0  0.0  187.0    80.0  27.00
18  1.0  0.0  0.0  183.0    88.0  28.00
19  1.0  0.0  0.0  159.0    40.0  29.00
20  1.0  0.0  0.0  164.0    66.0  32.00
21  1.0  0.0  0.0  166.0    56.0  42.00
     fr   tr   us   Size  Weight    Age sex
0   0.0  1.0  0.0  130.0    30.0  10

## Splitting Data into Training and Testing Sets

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split
# reproducible. `r` holds the features and `result3` the `sex` target.
x_train, x_test, y_train, y_test = train_test_split(
    r,
    result3,
    random_state=0,
    test_size=0.33,
)

## Feature Scaling and Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
sc = StandardScaler()
sc.fit(x_train)
X_train = sc.transform(x_train)
X_test = sc.transform(x_test)

# Flatten the single-column target frames into 1-D label arrays.
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

# Train the classifier on the scaled training data and predict the test rows.
logr = LogisticRegression(random_state=0)
logr.fit(X_train, y_train)
y_pred = logr.predict(X_test)

# Show the predictions next to the true labels for a visual comparison.
print(y_pred)
print(y_test)

['m' 'w' 'm' 'w' 'm' 'm' 'w' 'm']
['w' 'w' 'w' 'w' 'm' 'w' 'w' 'w']
