<a href="https://colab.research.google.com/github/ARAN1218/SkillsValue_AI/blob/main/jupyter%20notebook/2.coconala_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as st
pd.set_option('display.max_columns', 50)

## データ読み込み

### 作業自動化・効率化

In [None]:
df_auto = pd.read_pickle('coconala_automation.pickle')
df_auto

### 占い

In [None]:
df_divination = pd.read_pickle('coconala_divination.pickle')
df_divination

### 機械学習・AI

In [None]:
df_ai = pd.read_pickle('coconala_AI.pickle')
df_ai

## 前処理

### No

In [None]:
def No_preprocessing(df):
    """Remove the ``No`` column from ``df`` in place.

    Args:
        df: DataFrame that contains a ``No`` column. It is modified in place.

    Raises:
        KeyError: If ``df`` has no ``No`` column.
    """
    df.drop(columns=['No'], inplace=True)
    print("No preprocessed!")


In [None]:
# test
df = df_auto.copy()
No_preprocessing(df)
df

## value

In [None]:
def value_preprocessing(df):
    """Turn the ``value`` (price) column into integers, in place.

    Rows whose price text contains '分' are excluded from the analysis
    (presumably per-minute pricing -- confirm against the scraped data), as
    are rows whose price is missing. For the remaining rows the characters
    ',', '円' and newline are removed and the result is cast to ``int``.

    Args:
        df: DataFrame with a string ``value`` column. It is modified in place;
            dropped rows keep no trace and surviving rows keep their index
            labels.

    Raises:
        ValueError: If a remaining price is not a plain integer after the
            characters above are removed.
    """
    # na=False treats a missing price as "does not contain '分'" instead of
    # propagating NaN into the boolean mask; regex=False matches the literal
    # character.
    excluded = df['value'].str.contains('分', regex=False, na=False)

    # Blank only the price cell (not the whole row) so the other columns keep
    # their dtypes, then drop every row without a usable price.
    df.loc[excluded, 'value'] = np.nan
    df.dropna(subset=['value'], inplace=True)

    strip_table = str.maketrans({',': '', '円': '', '\n': ''})
    df['value'] = df['value'].str.translate(strip_table).astype(int)
    print("value preprocessed!")


In [None]:
# test
df = df_auto.copy()
value_preprocessing(df)
df

In [None]:
df.value.value_counts()

In [None]:
#df[~df['value'].str.contains('分')]

## title

In [None]:
def title_preprocessing(df):
    """Remove the ``title`` column from ``df`` in place.

    Args:
        df: DataFrame that contains a ``title`` column. It is modified in place.

    Raises:
        KeyError: If ``df`` has no ``title`` column.
    """
    df.drop(columns=['title'], inplace=True)
    print("title preprocessed!")


In [None]:
# test
df = df_auto.copy()
title_preprocessing(df)
df

## gender

In [None]:
def gender_preprocessing(df):
    """Encode the ``gender`` column as an integer flag, in place.

    '男性' is encoded as 0 and every other value (including a missing one)
    as 1, so the column actually distinguishes the two groups.

    NOTE(review): the polarity (0 for '男性') keeps the value the original
    code assigned to '男性'; confirm this is the intended encoding.

    Args:
        df: DataFrame with a ``gender`` column. It is modified in place.
    """
    # Element-wise mapping on the single column; no row-wise DataFrame.apply
    # is needed because only ``gender`` is inspected.
    df['gender'] = df['gender'].map(lambda g: 0 if g == '男性' else 1).astype(int)
    print("gender preprocessed!")


In [None]:
# test
df = df_auto.copy()
gender_preprocessing(df)
df

## type

In [None]:
def type_preprocessing(df, type_name):
    """Overwrite the ``type`` column of ``df`` with a single label, in place.

    Args:
        df: DataFrame to modify in place.
        type_name: Value written to every row of the ``type`` column.
    """
    # NOTE: the label may need to differ per kind of data set.
    category_label = type_name
    df['type'] = category_label
    print("type preprocessed!")


In [None]:
# test
df = df_auto.copy()
type_preprocessing(df, '開発作業自動化・効率化')
df

In [None]:
df.type.dtype

In [None]:
df.type.value_counts()

In [None]:
#df = df_auto.copy()
# Collect the index labels of the rows carrying this type value, then show the
# same rows from a fresh copy of the raw data.
df_list = list(df[df['type'] == '開発IT・プログラミン'].index)
df = df_auto.copy()
# ``df_list`` holds index *labels*, so select with the label-based ``loc``;
# the positional ``iloc`` only happens to agree when the index is 0..n-1.
df.loc[df_list]

## pro

In [None]:
def pro_preprocessing(df):
    """Cast the ``pro`` column of ``df`` to ``int``, in place.

    Args:
        df: DataFrame with a ``pro`` column. It is modified in place.
    """
    as_int = df['pro'].astype(int)
    df['pro'] = as_int
    print("pro preprocessed!")


In [None]:
# test
df = df_auto.copy()
pro_preprocessing(df)
df

In [None]:
df.pro.dtype

## rank

In [None]:
def rank_preprocessing(df):
    """Cast the ``rank`` column of ``df`` to ``int``, in place.

    Args:
        df: DataFrame with a ``rank`` column. It is modified in place.
    """
    as_int = df['rank'].astype(int)
    df['rank'] = as_int
    print("rank preprocessed!")


In [None]:
# test
df = df_auto.copy()
rank_preprocessing(df)
df

In [None]:
df['rank'].dtype

## achievements

In [None]:
df.achievements.value_counts()

In [None]:
def achievements_preprocessing(df):
    """Extract the numeric count from the ``achievements`` column, in place.

    Each cell is stringified and stripped of ',' separators. When the text
    contains '：', the part between the first '：' and the following '件' is
    taken as the count; otherwise the count is 0. The column ends up as ``int``.

    Args:
        df: DataFrame with an ``achievements`` column. It is modified in place.
    """
    def _extract_count(text):
        # No '：' separator means there is no count to read.
        if '：' not in text:
            return 0
        after_colon = text.split('：')[1]
        return after_colon.split('件')[0]

    without_commas = df['achievements'].astype(str).str.translate(str.maketrans({',': ''}))
    df['achievements'] = without_commas
    df['achievements'] = df['achievements'].map(_extract_count)
    df['achievements'] = df['achievements'].astype(int)
    print("achievements preprocessed!")


In [None]:
# test
df = df_auto.copy()
achievements_preprocessing(df)
df

In [None]:
df.achievements.value_counts()

In [None]:
df[df['achievements'].isnull()]

## サービス内容/提案のわかりやすさ・コミュニケーション・クオリティ・納期/スケジュール・rank_sample・評価・販売実績・残り

In [None]:
df['クオリティ'].value_counts()

In [None]:
def scores_preprocessing(df):
    """Fill missing score/count columns with 0 and cast them, in place.

    Args:
        df: DataFrame containing the eight columns listed below. It is
            modified in place.
    """
    # Target dtype for each column, in the order the casts are applied.
    target_dtypes = {
        'サービス内容/提案のわかりやすさ': float,
        'コミュニケーション': float,
        'クオリティ': float,
        '納期/スケジュール': float,
        'rank_sample': int,
        '評価': float,
        '販売実績': int,
        '残り': int,
    }

    # Missing values become 0 in exactly these columns.
    df.fillna(dict.fromkeys(target_dtypes, 0), inplace=True)

    # Cast every column to its matching dtype.
    for column, dtype in target_dtypes.items():
        df[column] = df[column].astype(dtype)

    print("scores preprocessed!")


In [None]:
# test
df = df_auto.copy()
scores_preprocessing(df)
df

In [None]:
df.rank_sample.value_counts()

In [None]:
df[df['rank_sample'].isnull()]

## お届け日数

In [None]:
df = df_auto.copy()
df.お届け日数.unique()

In [None]:
# Split on "/" first, then separate the planned and the actual part?
# Stringify so the membership tests below work on every cell.
df['お届け日数'] = df['お届け日数'].astype(str)
# Planned: the part before "/"; without "/", the whole text unless it contains '実績' (then 0).
df['お届け日数(予定)'] = df['お届け日数'].map(lambda x : x.split("/")[0] if "/" in x else x if '実績' not in x else 0)
# Actual: the part after "/"; without "/", the whole text only if it contains '実績' (else 0).
df['お届け日数(実績)'] = df['お届け日数'].map(lambda x : x.split("/")[1] if "/" in x else x if '実績' in x else 0)

# Inspect the distinct values produced for each new column.
print(df['お届け日数(予定)'].unique())
print(df['お届け日数(実績)'].unique())

In [None]:
# Remove the decoration characters around the day count.
# The planned column additionally drops '予'/'定', the actual column '実'/'績'.
df['お届け日数(予定)'] = df['お届け日数(予定)'].str.translate(
    str.maketrans({
        '\n': '', '日': '', '（': '', '）': '', '以': '', '内': '',
        ' ': '', '予': '', '定': '', '約': '',
    })
)
df['お届け日数(実績)'] = df['お届け日数(実績)'].str.translate(
    str.maketrans({
        '\n': '', '日': '', '（': '', '）': '', '以': '', '内': '',
        ' ': '', '実': '', '績': '', '約': '',
    })
)

# Inspect the distinct values left in each column.
print(df['お届け日数(予定)'].unique())
print(df['お届け日数(実績)'].unique())

In [None]:
# Replace '要相談' and 'nan' with 0.
# Use a single ``.loc`` assignment per rule: the chained form
# ``df[col][mask] = 0`` writes to an intermediate object, which pandas warns
# about and which leaves ``df`` untouched under Copy-on-Write.
df.loc[df['お届け日数(予定)'] == '要相談', 'お届け日数(予定)'] = 0
df.loc[df['お届け日数(予定)'].astype(str) == 'nan', 'お届け日数(予定)'] = 0
df.loc[df['お届け日数(実績)'].astype(str) == 'nan', 'お届け日数(実績)'] = 0

# Inspect the distinct values left in each column.
print(df['お届け日数(予定)'].unique())
print(df['お届け日数(実績)'].unique())

In [None]:
# Convert both delivery-day columns to int.
df['お届け日数(予定)'] = df['お届け日数(予定)'].astype(int)
df['お届け日数(実績)'] = df['お届け日数(実績)'].astype(int)

# Inspect the distinct values after the cast.
print(df['お届け日数(予定)'].unique())
print(df['お届け日数(実績)'].unique())

In [None]:
def _delivery_days(fragment, strip_table):
    """Return the day count contained in one delivery-time text fragment.

    Args:
        fragment: Text of the planned or the actual part.
        strip_table: ``str.maketrans`` table deleting the decoration
            characters around the number.

    Returns:
        The day count as ``int``; 0 when the fragment carries no number
        (empty text, '要相談', or a stringified missing value).

    Raises:
        ValueError: If the stripped fragment is neither a placeholder nor an
            integer.
    """
    digits = fragment.translate(strip_table)
    if digits in ('', '要相談', 'nan', 'None'):
        return 0
    return int(digits)


def delivery_preprocessing(df):
    """Split ``お届け日数`` into planned/actual day counts, in place.

    The text is split on "/": the first part is the planned value and the
    second the actual value. Without "/", the text counts as the actual value
    when it contains '実績' and as the planned value otherwise; the other side
    is 0. Decoration characters are removed, placeholders become 0 and both
    results are stored as ``int`` in ``お届け日数(予定)`` and ``お届け日数(実績)``.
    The source column is deleted and ``お届け日数(予定-実績)`` (planned minus
    actual) is added as an engineered feature.

    Args:
        df: DataFrame with an ``お届け日数`` column. It is modified in place.

    Raises:
        ValueError: If a fragment is not an integer after stripping.
    """
    shared_noise = ['\n', '日', '（', '）', '以', '内', ' ', '約']
    planned_table = str.maketrans(dict.fromkeys(shared_noise + ['予', '定'], ''))
    actual_table = str.maketrans(dict.fromkeys(shared_noise + ['実', '績'], ''))

    # Work on plain Python strings; a missing cell becomes '' (-> 0 days).
    raw = df['お届け日数'].map(lambda v: '' if pd.isna(v) else str(v))

    def planned_fragment(text):
        if '/' in text:
            return text.split('/')[0]
        return '' if '実績' in text else text

    def actual_fragment(text):
        if '/' in text:
            return text.split('/')[1]
        return text if '実績' in text else ''

    # Parsing each cell in Python avoids chained assignment (df[col][mask] = 0),
    # which does not write back under Copy-on-Write, and avoids the ``.str``
    # accessor failing when a column holds no strings at all.
    df['お届け日数(予定)'] = raw.map(
        lambda text: _delivery_days(planned_fragment(text), planned_table)
    ).astype(int)
    df['お届け日数(実績)'] = raw.map(
        lambda text: _delivery_days(actual_fragment(text), actual_table)
    ).astype(int)

    # The raw text column is no longer needed.
    del df['お届け日数']

    # Feature engineering: planned minus actual delivery days.
    df['お届け日数(予定-実績)'] = df['お届け日数(予定)'] - df['お届け日数(実績)']

    print("delivery preprocessed!")


In [None]:
# test
df = df_auto.copy()
delivery_preprocessing(df)
df

## 初回返答時間

In [None]:
df.初回返答時間.value_counts()

In [None]:
def responce_preprocessing(df):
    """Convert the ``初回返答時間`` column to integers, in place.

    Decoration characters are removed so only the number remains; "24時間以上"
    and "24時間以内" are deliberately not distinguished. Missing values
    become 0.

    Args:
        df: DataFrame with a string ``初回返答時間`` column. It is modified in
            place.

    Raises:
        ValueError: If a value is not an integer after stripping.
    """
    strip_table = str.maketrans({
        '\n': '', ' ': '', '時': '', '間': '', '以': '', '内': '', '上': '',
        '（': '', '実': '', '績': '', '）': '',
    })
    cleaned = df['初回返答時間'].str.translate(strip_table)
    # Assign the filled result back explicitly: ``df[col].fillna(..., inplace=True)``
    # acts on an intermediate object and does not update ``df`` under
    # Copy-on-Write. The fill value is the string '0' so it is valid for
    # string-typed columns and becomes 0 in the cast below.
    df['初回返答時間'] = cleaned.fillna('0').astype(int)
    print("responce preprocessed!")


In [None]:
# test
df = df_auto.copy()
responce_preprocessing(df)
df

In [None]:
df.初回返答時間.unique()

## 提供形式

In [None]:
df.提供形式.value_counts()

In [None]:
def order_preprocessing(df):
    """Remove the ``提供形式`` column from ``df`` in place.

    Args:
        df: DataFrame that contains a ``提供形式`` column. It is modified in place.

    Raises:
        KeyError: If ``df`` has no ``提供形式`` column.
    """
    df.drop(columns=['提供形式'], inplace=True)
    print("order preprocessed!")


In [None]:
# test
df = df_auto.copy()
order_preprocessing(df)
df

## データ前処理関数

In [None]:
def coconala_preprocessing(df, type_name):
    """Run every column-level preprocessing step on ``df``, in place.

    The steps are applied in this order: No, value, title, gender, type, pro,
    rank, achievements, scores, delivery, responce, order.

    Args:
        df: Raw DataFrame to preprocess. It is modified in place.
        type_name: Label passed to ``type_preprocessing`` for the ``type``
            column.
    """
    import warnings

    # Suppress warnings only while the pipeline runs. catch_warnings restores
    # the previous filters on exit, so warnings raised by later code in the
    # same session stay visible.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        No_preprocessing(df)
        value_preprocessing(df)
        title_preprocessing(df)
        gender_preprocessing(df)
        type_preprocessing(df, type_name)
        pro_preprocessing(df)
        rank_preprocessing(df)
        achievements_preprocessing(df)
        scores_preprocessing(df)
        delivery_preprocessing(df)
        responce_preprocessing(df)
        order_preprocessing(df)

    print("ALL PREPROCESSING COMPLETED!!")
    

In [None]:
# test
df_final = df_auto.copy()
coconala_preprocessing(df_final, '開発作業自動化・効率化')
df_final