## 載入套件及檔案

In [None]:
!pip install seaborn

In [None]:
!pip install mlxtend

In [None]:
import pandas as pd
import numpy as np
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Let pandas print up to 800 rows before it truncates displayed output.
pd.options.display.max_rows = 800

In [None]:
# Load the raw data set from 'add_simple.csv' (path relative to the working directory).
df = pd.read_csv('add_simple.csv')

In [None]:
# Preview the first 500 rows of the loaded data.
df.head(500)

## 資料清理

In [None]:
# Inspect the Python type of a sample 'xword' entry (the row labelled 5).
type(df.loc[5, 'xword'])

In [None]:
# Strip list-literal residue (brackets, single quotes, spaces, newlines) from
# the first 17 columns. Every cell goes through str() first, so missing values
# end up as the literal text 'nan'.
_STRIP_CHARS = str.maketrans('', '', "[]' \n")


def _strip_list_residue(value):
    """Return str(value) without '[', ']', "'", space and newline characters."""
    return str(value).translate(_STRIP_CHARS)


# Whole columns are rewritten at once. Assigning cell by cell through
# ``df.iloc[i, j]`` is very slow and pushes strings into numeric columns one
# element at a time, which recent pandas versions warn about or reject as an
# incompatible-dtype assignment.
for _col in df.columns[:17]:
    df[_col] = df[_col].map(_strip_list_residue)

In [None]:
# Rewrite the Chinese labels in the column at position 15 ("公然侮辱" = public
# insult, "誹謗" = defamation, each with a _1/_2 suffix) as the codes '309_1',
# '309_2', '310_1' and '310_2'. The replacements run in the listed order on the
# whole column at once instead of assigning row by row through ``df.iloc``.
_LAW_CODES = (
    ("公然侮辱_1", '309_1'),
    ("公然侮辱_2", "309_2"),
    ("誹謗_1", "310_1"),
    ("誹謗_2", "310_2"),
)


def _to_law_code(value):
    """Return str(value) with every known label replaced by its code."""
    text = str(value)
    for label_text, code in _LAW_CODES:
        text = text.replace(label_text, code)
    return text


_law_col = df.columns[15]
df[_law_col] = df[_law_col].map(_to_law_code)

In [None]:
# Cast the 'years' and 'label' columns to integers, one column at a time.
for int_col in ('years', 'label'):
    df[int_col] = df[int_col].astype(int)

In [None]:
# Turn the literal text 'nan' (what str() produces for a missing value) back
# into real missing values in the first 17 columns.
# Each column is handled in one vectorised step instead of a per-cell
# ``df.iloc[i, j]`` loop. Columns that contain no 'nan' text are not
# reassigned, so their dtype is left exactly as it was.
for _col in df.columns[:17]:
    _is_nan_text = df[_col] == 'nan'
    if _is_nan_text.any():
        df[_col] = df[_col].mask(_is_nan_text, None)

## 特徵工程

In [None]:
# Show column dtypes and non-null counts after cleaning.
df.info()

In [None]:
# Count how often each 'simplejudge' value occurs.
df['simplejudge'].value_counts()

In [None]:
# Count how often each 'court' value occurs.
df['court'].value_counts()

In [None]:
# Frequency of each 'label' value within every 'law' group.
df.groupby(['law'])['label'].value_counts()

In [None]:
# Frequency of each 'xword' value within every 'law' group.
df.groupby(['law'])['xword'].value_counts()

In [None]:
# Frequency of each 'simplejudge' value within every 'law' group.
df.groupby(['law'])['simplejudge'].value_counts()

In [None]:
# Summary statistics of 'label' per (law, simplejudge) group, shown as whole
# numbers. Values are truncated toward zero exactly like int(); the nullable
# Int64 dtype keeps undefined statistics (e.g. the std of a single-row group)
# as <NA> instead of raising "cannot convert float NaN to integer".
label_stats = df.groupby(['law', 'simplejudge'])['label'].describe()
np.trunc(label_stats).astype('Int64')

In [None]:
# Summary statistics of 'label' per 'law' group, shown as whole numbers.
# Values are truncated toward zero exactly like int(); the nullable Int64
# dtype keeps undefined statistics (e.g. the std of a single-row group) as
# <NA> instead of raising "cannot convert float NaN to integer".
law_label_stats = df.groupby(['law'])['label'].describe()
np.trunc(law_label_stats).astype('Int64')

In [None]:
# Cross-tabulate 'law' (rows) against 'label' (columns) and shade the cells
# with the reversed 'summer' colormap.
law_by_label = pd.crosstab(index=df['law'], columns=df['label'])
law_by_label.style.background_gradient(cmap='summer_r')

In [None]:
# One wide histogram panel of 'label' per 'law' value, stacked in a single
# column (col_wrap=1), without a KDE overlay.
g = sns.FacetGrid(data=df, col='law', col_wrap=1, aspect=3)
g.map(sns.histplot, 'label', kde=False)

In [None]:
# Rows whose 'law' value is '309_1': build a boolean selector, convert it to
# row positions, then pull those rows out by position.
mask = df['law'].eq('309_1')
pos = np.nonzero(mask.values)[0]
df309_1 = df.take(pos)
# Optional per-'simplejudge' facet view, kept disabled:
# g = sns.FacetGrid(df309_1, col='simplejudge', col_wrap=1, aspect=3)
# g.map(sns.histplot,'label',kde=False)
# Histogram of 'label' for this subset, without a KDE overlay.
sns.histplot(df309_1['label'], kde=False)

In [None]:
# Rows whose 'law' value is '309_2': build a boolean selector, convert it to
# row positions, then pull those rows out by position.
mask = df['law'].eq('309_2')
pos = np.nonzero(mask.values)[0]
df309_2 = df.take(pos)
# Optional per-'simplejudge' facet view, kept disabled:
# g = sns.FacetGrid(df309_2, col='simplejudge', col_wrap=1, aspect=3)
# g.map(sns.histplot,'label',kde=False)
# Histogram of 'label' for this subset, without a KDE overlay.
sns.histplot(df309_2['label'], kde=False)

In [None]:
# Rows whose 'law' value is '310_1': build a boolean selector, convert it to
# row positions, then pull those rows out by position.
mask = df['law'].eq('310_1')
pos = np.nonzero(mask.values)[0]
df310_1 = df.take(pos)
# Optional per-'simplejudge' facet view, kept disabled:
# g = sns.FacetGrid(df310_1, col='simplejudge', col_wrap=1, aspect=3)
# g.map(sns.histplot,'label',kde=False)
# Histogram of 'label' for this subset, without a KDE overlay.
sns.histplot(df310_1['label'], kde=False)

In [None]:
# Rows whose 'law' value is '310_2': build a boolean selector, convert it to
# row positions, then pull those rows out by position.
mask = df['law'].eq('310_2')
pos = np.nonzero(mask.values)[0]
df310_2 = df.take(pos)
# Optional per-'simplejudge' facet view, kept disabled:
# g = sns.FacetGrid(df310_2, col='simplejudge', col_wrap=1, aspect=3)
# g.map(sns.histplot,'label',kde=False)
# Histogram of 'label' for this subset, without a KDE overlay.
sns.histplot(df310_2['label'], kde=False)

In [None]:
# Bar chart of how often each 'label' value occurs, split by 'law', on a wide
# 25x6 inch figure.
plt.figure(figsize=(25, 6))
# Columns are passed by keyword: recent seaborn releases treat the first
# positional argument as ``data`` rather than ``x``, so a positional Series
# would no longer be plotted on the x axis.
sns.countplot(data=df, x='label', hue='law')

## 刪補空值

In [None]:
# Show column dtypes and non-null counts before missing values are filled.
df.info()

In [None]:
# Display the rows that have no 'court' value.
mask = df['court'].isna()
df.loc[mask]

In [None]:
# Default value used for each column when the entry is missing.
# 'xword' is deliberately absent: its fill is disabled in the original cell
# (`df['xword'].fillna(value='無', inplace=True)`), so its gaps stay missing.
fill_defaults = {
    'court': '市區',
    'record': '無',
    'place': '實體',
    'compromise': '無',
    'education': '高中職',
    'mind': '正常',
    'financial': '正常',
    'support': '無',
    'attitude': '尚可',
    'confess': '否',
    'sequel': '否',
}
# Fill through the DataFrame itself. ``df[col].fillna(..., inplace=True)`` is
# chained assignment on a temporary Series, which does not update ``df`` when
# pandas Copy-on-Write is active (pandas warns about this pattern).
df.fillna(value=fill_defaults, inplace=True)

In [None]:
# Re-check dtypes and non-null counts after filling missing values.
df.info()

In [None]:
# Discard every row that still has at least one missing value.
df = df.dropna(how='any', axis='index')

## 離群值處理

In [None]:
# The final model is expected to be a classifier, so 'label' is not
# transformed with a log or square; outliers are removed with the IQR rule.
print ("Shape Of The Before Ouliers: ",df309_1.shape)
n=1.5
# Both quartiles are taken once from the same untrimmed data so the two fences
# are consistent. Previously Q1 was recomputed after the upper trim while the
# IQR still came from the untrimmed data.
q1, q3 = np.percentile(df309_1['label'], [25, 75])
# IQR = Q3 - Q1
IQR = q3 - q1
lower_fence = q1 - n*IQR
upper_fence = q3 + n*IQR
# Keep only rows strictly inside (Q1 - n*IQR, Q3 + n*IQR).
inside_fences = (df309_1['label'] > lower_fence) & (df309_1['label'] < upper_fence)
df309_1 = df309_1[inside_fences]
print ("Shape Of The After Ouliers: ",df309_1.shape)

In [None]:
# Histogram of 'label' in the 309_1 subset, without a KDE overlay.
sns.histplot(data=df309_1, x='label', kde=False)

In [None]:
# Count how often each 'label' value occurs in the 309_1 subset.
df309_1['label'].value_counts()

## 轉碼

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode the categorical columns on a deep copy, leaving `df` as is.
# A single encoder is refit for each column in the listed order, so afterwards
# it only holds the classes of the last column ('sequel').
labelencoder = LabelEncoder()
df2 = df.copy(deep=True)
categorical_columns = [
    'simplejudge',
    'court',
    'record',
    'place',
    'compromise',
    'xword',
    'education',
    'mind',
    'financial',
    'support',
    'attitude',
    'confess',
    'law',
    'sequel',
]
for column_name in categorical_columns:
    df2[column_name] = labelencoder.fit_transform(df2[column_name])

In [None]:
# Preview the first 500 rows of the encoded copy.
df2.head(500)

## 切分資料集

In [None]:
# Full data set: the target is 'label'; the features are every column except
# 'label', 'id' and 'years'.
y = df2['label']
X = df2.drop(columns=['label', 'id', 'years'])

In [None]:
# 309-1: rows whose encoded 'law' equals 0.
# NOTE(review): presumes the encoder assigned 0 to '309_1' - confirm.
law = df2.groupby(by='law')
df_insult1 = law.get_group(0)
# df_insult1 = df_insult1.sample(200)
y1 = df_insult1.loc[:, 'label']
X1 = df_insult1.drop(columns=['label', 'id', 'years', 'law'])

In [None]:
# 309-2: rows whose encoded 'law' equals 1.
# NOTE(review): presumes the encoder assigned 1 to '309_2' - confirm.
law = df2.groupby(by='law')
df_insult2 = law.get_group(1)
y2 = df_insult2.loc[:, 'label']
X2 = df_insult2.drop(columns=['label', 'id', 'years', 'law'])

In [None]:
# 310-1: rows whose encoded 'law' equals 2.
# NOTE(review): presumes the encoder assigned 2 to '310_1' - confirm.
law = df2.groupby(by='law')
df_insult3 = law.get_group(2)
# df_insult3 = df_insult3.sample(200)
y3 = df_insult3.loc[:, 'label']
X3 = df_insult3.drop(columns=['label', 'id', 'years', 'law'])

In [None]:
# 310-2: rows whose encoded 'law' equals 3.
# NOTE(review): presumes the encoder assigned 3 to '310_2' - confirm.
law = df2.groupby(by='law')
df_insult4 = law.get_group(3)
# df_insult4 = df_insult4.sample(200)
y4 = df_insult4.loc[:, 'label']
X4 = df_insult4.drop(columns=['label', 'id', 'years', 'law'])

## Dimension Reduction (MDS、PCA、Isomap)

In [None]:
from sklearn import preprocessing

In [None]:
# Pick the feature matrix / target pair that the following cells analyse.
X_test, y_test = X1, y1

In [None]:
# Standardise the selected features; the fitted scaler stays available as `scaler`.
scaler = preprocessing.StandardScaler()
X_test = scaler.fit_transform(X_test)

In [None]:
from sklearn.manifold import MDS
# Project the features onto two MDS dimensions and colour the points by target.
mds=MDS(n_components=2)
# fit_transform() already fits the model; the separate fit() that used to
# precede it only repeated the whole (expensive) computation.
X_mds=mds.fit_transform(X_test)
plt.scatter(X_mds[:,0],X_mds[:,1],c=y_test,alpha=.5)
plt.colorbar()
plt.title('Using sklearn MDS')

In [None]:
from sklearn.decomposition import PCA
# Project the features onto the first two principal components and colour the
# points by target.
pca=PCA(n_components=2)
# fit_transform() already fits the model; the separate fit() that used to
# precede it only repeated the decomposition.
X_PCA=pca.fit_transform(X_test)
plt.scatter(X_PCA[:,0],X_PCA[:,1],c=y_test,alpha=.5)
plt.colorbar ()
plt.title('Using sklearn PCA')

In [None]:
from sklearn.manifold import Isomap
# Fit a two-component Isomap on the features, then embed the same samples
# through transform() and colour the points by target.
iso = Isomap(n_components=2).fit(X_test)
X_iso = iso.transform(X_test)
first_axis, second_axis = X_iso[:, 0], X_iso[:, 1]
plt.scatter(first_axis, second_axis, c=y_test, alpha=.5)
plt.colorbar()
plt.title('Using sklearn Isomap')

## Stepwise Regression

In [None]:
from sklearn.feature_selection import RFE
from sklearn import datasets, linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
# Rank every feature by recursive feature elimination: one feature is dropped
# per step until a single feature is left.
# Alternative estimators, kept disabled:
# estimator =  neighbors.KNeighborsClassifier(algorithm = 'brute', n_neighbors = 3, weights = 'distance', p = 1)
estimator = RandomForestClassifier(max_depth=6, n_estimators=10)
# estimator = linear_model.LinearRegression()
selector = RFE(estimator, n_features_to_select=1, step=1)
selector = selector.fit(X_test, y_test)
print(selector.support_)
print(selector.ranking_)
# ranking_[j] is the rank of feature j (1 = best), so the features in rank
# order are given by the argsort of ranking_. Indexing the columns with the
# rank values themselves, as before, printed the names in the wrong order.
# Names come from X1 because X_test is a plain array after scaling; X1 must be
# the frame X_test was built from.
for feature_idx in np.argsort(selector.ranking_, kind='stable'):
    print(X1.columns[feature_idx])