В одной компании произошла неприятность: случился потоп, в бумажных анкетах клиентов размыло некоторые строки и в 1000 анкетах пострадало поле “gender”. Конечно, их было бы можно восстановить, но в компании решили автоматизировать решение проблемы и определить пол клиентов по фотографиям. Был создан датасет gender.csv, в котором записаны признаки человека на фото и его пол.

Первые 1000 значений имеют пропуски в столбце gender.

Все столбцы с ".1" в названии дублируют предыдущие столбцы без ".1", но содержат уникальные значения

Необходимо произвести чистку данных:

Присоединить дублирующие столбцы к основным.
Столбцы 'nose_wide', 'nose_long', 'lips_thin', 'distance_nose_to_lip_long', 'long_hair' содержат информацию в условных единицах. Известно, что измерения проводились разными людьми с разными измерительными инструментами, поэтому данные в столбцах имеют разную погрешность измерений. Необходимо уменьшить влияние погрешности путем приравнивания значений диапазона [0,0.5) к 0, а [0.5, 1) —- к 1. Столбцы 'orehead_width_cm' и 'forehead_height_cm' необходимо оставить «как есть».
Оставить один из дублирующих друг друга столбцов.
Определите пол по остальным параметрам из выборки для первой тысячи значений и прикрепите его в CSV формате. Файл должен содержать только прогнозные значения в формате "Male"/"Female", в одном столбце без заголовка

Пороговое значение для данной задачи классификации считать равным 0.5.

Для успешного прохождения достаточно иметь 70% точность решения

In [1]:
import pandas as pd
%run pipeline.ipynb

In [2]:
df=pd.read_csv('datasets/gender.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,index,long_hair,forehead_width_cm,forehead_height_cm,forehead_width_mm,forehead_width_conventional_units,nose_wide,nose_long,lips_thin,...,long_hair.1,forehead_width_cm.1,forehead_height_cm.1,forehead_width_mm.1,forehead_width_conventional_units.1,nose_wide.1,nose_long.1,lips_thin.1,distance_nose_to_lip_long.1,gender.1
0,0,0,0.704275,11.8,6.1,118.0,0.025806,0.895614,0.477485,0.841261,...,0.818018,11.5,5.8,115.0,0.006452,0.137806,0.40262,0.167844,0.188913,Female
1,1,1,0.11869,14.0,5.4,140.0,0.167742,0.20252,0.186825,0.719697,...,0.995941,15.2,5.8,152.0,0.245161,0.570656,0.980237,0.811487,0.96754,Male
2,2,2,0.203894,11.8,6.3,118.0,0.025806,0.892793,0.723152,0.725821,...,0.633806,15.4,5.7,154.0,0.258065,0.709777,0.365825,0.933026,0.646992,Male
3,3,3,0.386228,14.4,6.1,144.0,0.193548,0.024963,0.562005,0.831515,...,0.795792,13.0,6.9,130.0,0.103226,0.883454,0.399733,0.595028,0.822491,Male
4,4,4,0.56402,13.5,5.9,135.0,0.135484,0.415389,0.063705,0.211584,...,0.915185,14.2,5.8,142.0,0.180645,0.652973,0.735543,0.553571,0.53291,Male


In [3]:
pipe.info.getInfo(df)

Unnamed: 0,DataType,Values,Null,Missing Rate,Unique,Useless
Unnamed: 0,int64,2515,0,0.00%,2515,True
index,int64,2515,0,0.00%,2515,True
long_hair,float64,2515,0,0.00%,2515,False
forehead_width_cm,float64,2515,0,0.00%,42,False
forehead_height_cm,float64,2515,0,0.00%,21,False
forehead_width_mm,float64,2515,0,0.00%,42,False
forehead_width_conventional_units,float64,2515,0,0.00%,42,False
nose_wide,float64,2515,0,0.00%,2515,False
nose_long,float64,2515,0,0.00%,2515,False
lips_thin,float64,2515,0,0.00%,2515,False


Присоединим столбцы

In [4]:
df.columns

Index(['Unnamed: 0', 'index', 'long_hair', 'forehead_width_cm',
       'forehead_height_cm', 'forehead_width_mm',
       'forehead_width_conventional_units', 'nose_wide', 'nose_long',
       'lips_thin', 'distance_nose_to_lip_long', 'gender', 'index.1',
       'long_hair.1', 'forehead_width_cm.1', 'forehead_height_cm.1',
       'forehead_width_mm.1', 'forehead_width_conventional_units.1',
       'nose_wide.1', 'nose_long.1', 'lips_thin.1',
       'distance_nose_to_lip_long.1', 'gender.1'],
      dtype='object')

In [5]:
for col in ['long_hair', 'forehead_width_cm',
            'forehead_height_cm', 'forehead_width_mm',
            'forehead_width_conventional_units', 'nose_wide', 'nose_long',
            'lips_thin', 'distance_nose_to_lip_long']:
    df[col]=(df[col]+df[col+'.1'])/2

In [6]:
for col in ['nose_wide', 'nose_long', 'lips_thin', 'distance_nose_to_lip_long', 'long_hair']:
    df[col]=(df[col]<0.5).astype('int')
df.columns  

Index(['Unnamed: 0', 'index', 'long_hair', 'forehead_width_cm',
       'forehead_height_cm', 'forehead_width_mm',
       'forehead_width_conventional_units', 'nose_wide', 'nose_long',
       'lips_thin', 'distance_nose_to_lip_long', 'gender', 'index.1',
       'long_hair.1', 'forehead_width_cm.1', 'forehead_height_cm.1',
       'forehead_width_mm.1', 'forehead_width_conventional_units.1',
       'nose_wide.1', 'nose_long.1', 'lips_thin.1',
       'distance_nose_to_lip_long.1', 'gender.1'],
      dtype='object')

In [7]:
df=df.drop(columns=['index.1',
       'long_hair.1', 'forehead_width_cm.1', 'forehead_height_cm.1',
       'forehead_width_mm.1', 'forehead_width_conventional_units.1',
       'nose_wide.1', 'nose_long.1', 'lips_thin.1',
       'distance_nose_to_lip_long.1'])

In [8]:
df_p = df[:1001].copy()
df_p

Unnamed: 0.1,Unnamed: 0,index,long_hair,forehead_width_cm,forehead_height_cm,forehead_width_mm,forehead_width_conventional_units,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender,gender.1
0,0,0,0,11.65,5.95,116.5,0.016129,0,1,0,0,,Female
1,1,1,0,14.60,5.60,146.0,0.206452,1,0,0,0,,Male
2,2,2,1,13.60,6.00,136.0,0.141935,0,0,0,0,,Male
3,3,3,0,13.70,6.50,137.0,0.148387,1,1,0,0,,Male
4,4,4,0,13.85,5.85,138.5,0.158065,0,1,1,1,,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,996,996,0,12.05,5.95,120.5,0.041935,1,1,1,0,,Female
997,997,997,0,13.90,5.65,139.0,0.161290,1,1,0,1,,Female
998,998,998,0,13.05,6.50,130.5,0.106452,0,0,0,0,,Male
999,999,999,0,14.20,6.00,142.0,0.180645,1,1,1,0,,Female


In [9]:
df_t=df[1001:].copy()
df_t

Unnamed: 0.1,Unnamed: 0,index,long_hair,forehead_width_cm,forehead_height_cm,forehead_width_mm,forehead_width_conventional_units,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender,gender.1
1001,1001,1001,0,13.10,5.90,131.0,0.109677,0,0,0,0,Male,Male
1002,1002,1002,0,12.65,6.10,126.5,0.080645,0,0,0,0,Male,Male
1003,1003,1003,0,13.70,5.85,137.0,0.148387,0,1,0,1,Female,Male
1004,1004,1004,0,13.35,6.50,133.5,0.125806,0,0,0,1,Female,Male
1005,1005,1005,0,13.25,5.70,132.5,0.119355,1,1,0,1,Female,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2510,2510,2510,0,13.75,5.75,137.5,0.151613,0,0,1,1,Male,Female
2511,2511,2511,0,12.85,5.25,128.5,0.093548,0,0,1,0,Male,Female
2512,2512,2512,0,12.55,5.50,125.5,0.074194,1,1,1,1,Female,Female
2513,2513,2513,0,12.65,5.70,126.5,0.080645,1,1,0,0,Male,Female


In [10]:
df_t.drop(columns='gender.1', inplace=True)

In [11]:
df_t=df_t[['long_hair', 'forehead_width_cm',
            'forehead_height_cm', 'forehead_width_mm',
            'forehead_width_conventional_units', 'nose_wide', 'nose_long',
            'lips_thin', 'distance_nose_to_lip_long','gender']]
df_t

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,forehead_width_mm,forehead_width_conventional_units,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
1001,0,13.10,5.90,131.0,0.109677,0,0,0,0,Male
1002,0,12.65,6.10,126.5,0.080645,0,0,0,0,Male
1003,0,13.70,5.85,137.0,0.148387,0,1,0,1,Female
1004,0,13.35,6.50,133.5,0.125806,0,0,0,1,Female
1005,0,13.25,5.70,132.5,0.119355,1,1,0,1,Female
...,...,...,...,...,...,...,...,...,...,...
2510,0,13.75,5.75,137.5,0.151613,0,0,1,1,Male
2511,0,12.85,5.25,128.5,0.093548,0,0,1,0,Male
2512,0,12.55,5.50,125.5,0.074194,1,1,1,1,Female
2513,0,12.65,5.70,126.5,0.080645,1,1,0,0,Male


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_t.drop(columns='gender'), (df_t['gender']=='Male').astype('int'), test_size=0.25, random_state=111)

In [13]:
#pipe.models.compare_model(pipe.models._get_classification_model_list(),X_train,y_train, scoring='precision')


In [14]:
model = LogisticRegression(random_state=5888)
model.fit(X_train,y_train)
prediction = model.predict(X_test)

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,prediction)

0.7255936675461742

In [16]:
model.fit(df_t.drop(columns='gender'),df_t['gender'])

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=5888)

In [17]:
df_p=df_p[['long_hair', 'forehead_width_cm',
            'forehead_height_cm', 'forehead_width_mm',
            'forehead_width_conventional_units', 'nose_wide', 'nose_long',
            'lips_thin', 'distance_nose_to_lip_long','gender']]
df_p=df_p.drop(columns='gender')
df_p

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,forehead_width_mm,forehead_width_conventional_units,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
0,0,11.65,5.95,116.5,0.016129,0,1,0,0
1,0,14.60,5.60,146.0,0.206452,1,0,0,0
2,1,13.60,6.00,136.0,0.141935,0,0,0,0
3,0,13.70,6.50,137.0,0.148387,1,1,0,0
4,0,13.85,5.85,138.5,0.158065,0,1,1,1
...,...,...,...,...,...,...,...,...,...
996,0,12.05,5.95,120.5,0.041935,1,1,1,0
997,0,13.90,5.65,139.0,0.161290,1,1,0,1
998,0,13.05,6.50,130.5,0.106452,0,0,0,0
999,0,14.20,6.00,142.0,0.180645,1,1,1,0


In [18]:
df_p['forehead_width_cm'].fillna(df_p['forehead_width_cm'].median(), inplace=True)
df_p['forehead_height_cm'].fillna(df_p['forehead_width_cm'].median(), inplace=True)
df_p['forehead_width_mm'].fillna(df_p['forehead_width_cm'].median(), inplace=True)
df_p['forehead_width_conventional_units'].fillna(df_p['forehead_width_cm'].median(), inplace=True)
prediction = model.predict(df_p)

In [22]:
pd.DataFrame(prediction).to_csv('gender.csv', header=False, index=False)