In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
from sklearn import linear_model

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

In [5]:
pop_male = pd.read_csv('1인가구_남자.csv', encoding='cp949', index_col=0)
pop_female = pd.read_csv('1인가구_여자.csv', encoding='cp949', index_col=0)
crime = pd.read_csv('년,월별 총 범죄데이터.csv', encoding='cp949', index_col=0)
crime_rate = pd.read_csv('년도별 범죄율 데이터만.csv', encoding='cp949', index_col=0)
pop = pd.read_csv('자치구별 주민등록인구.csv', encoding='cp949', index_col=0)

In [6]:
df = pd.concat([pop_male, pop_female, crime, crime_rate, pop], axis=1)
df.to_csv('crime_pop_all.csv', encoding='cp949')

## 나이브베이즈 분석

In [7]:
X = np.array(pd.DataFrame(df, columns = ['2020_30대_남자', '2021_30대_남자', '2022_30대_남자', '2020_30대_여자', '2021_30대_여자', '2022_30대_여자', '2020_40대_남자', '2021_40대_남자', '2022_40대_남자', '2020_40대_여자', '2021_40대_여자', '2022_40대_여자', '2022_50대_여자']))
y = np.array(pd.DataFrame(df, columns = ['총범죄건수']))

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print('X_train :', X_train)
print('X_test :', X_test)
print('y_train : ', y_train)
print('y_test :', y_test)

X_train : [[ 5287  5806  6207  4373  4855  5409  3012  3105  3216  2520  2625  2692
   2071]
 [ 5918  6431  6864  4141  4531  4972  4000  4097  4125  2538  2633  2673
   2653]
 [10181 10501 10686 11286 11137 11018  5750  6018  6172  5804  6178  6384
   3806]
 [ 4873  5298  5394  3844  4072  4272  3196  3237  3228  2458  2547  2603
   2266]
 [ 6645  7304  7999  4593  5050  5559  4490  4575  4677  2779  2830  2970
   2924]
 [ 2878  3256  3440  2085  2347  2548  2783  2907  2888  2066  2139  2217
   2958]
 [ 8398  9099  9606  8562  9049  9495  4291  4408  4599  4278  4443  4592
   3109]
 [ 5951  6380  6822  4440  4838  5371  4842  4828  4857  3062  3251  3332
   4057]
 [ 7779  8395  9126  6904  7407  7989  4310  4438  4413  3258  3373  3579
   3145]
 [ 8266  8968  9519  6111  6735  7057  4031  4269  4316  2897  2987  3178
   2731]
 [ 6026  6713  6974  4755  5482  5891  4357  4572  4557  3172  3412  3571
   3405]
 [ 2533  2806  3064  2140  2462  2629  1713  1749  1742  1280  1344  1413
   

In [14]:
gnb_clf = GaussianNB()
gnb_clf = gnb_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [15]:
gnb_prediction = gnb_clf.predict(X_test)

In [16]:
fmeasure = round(f1_score(y_test, gnb_prediction, average = 'weighted'), 2)

accuracy = round(accuracy_score(y_test, gnb_prediction, normalize = True), 2)

In [17]:
data_nbclf = pd.DataFrame(columns=['Classifier', 'F-Measure', 'Accuracy'])

data_nbclf.loc[len(data_nbclf)] = ['Naive Bayes', fmeasure, accuracy]

data_nbclf

Unnamed: 0,Classifier,F-Measure,Accuracy
0,Naive Bayes,0.0,0.0


## Decision Tree

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [35]:
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

# 예측
y_pred = regressor.predict(X_test)

# 회귀 평가 메트릭 계산
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 3556251.9


In [36]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X, y):
    print('Train set: ', train_index)
    print('Test set: ', test_index)

Train set:  [ 1  2  3  4  5  6  7  9 10 11 12 13 14 15 17 18 19 20 21 22 23 24]
Test set:  [ 0  8 16]
Train set:  [ 0  1  2  3  4  5  6  7  8 10 12 13 14 15 16 17 18 19 20 21 22 24]
Test set:  [ 9 11 23]
Train set:  [ 0  2  3  4  5  6  7  8  9 10 11 12 14 15 16 17 18 19 20 21 23 24]
Test set:  [ 1 13 22]
Train set:  [ 0  1  3  4  6  7  8  9 10 11 13 14 15 16 17 18 19 20 21 22 23 24]
Test set:  [ 2  5 12]
Train set:  [ 0  1  2  5  6  7  8  9 10 11 12 13 14 16 17 18 19 20 21 22 23 24]
Test set:  [ 3  4 15]
Train set:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 18 19 21 22 23 24]
Test set:  [17 20]
Train set:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 20 22 23 24]
Test set:  [18 21]
Train set:  [ 0  1  2  3  4  5  6  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Test set:  [ 7 24]
Train set:  [ 0  1  2  3  4  5  6  7  8  9 11 12 13 15 16 17 18 19 20 21 22 23 24]
Test set:  [10 14]
Train set:  [ 0  1  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17 18 20 21 22 23 24]

In [40]:
clf = DecisionTreeRegressor()
scores = cross_val_score(clf, X, y, cv=kf, scoring='neg_mean_squared_error')
print('K Fold Cross Validation Score')
print(scores)
print('Average Accuracy')
print(scores.mean())

K Fold Cross Validation Score
[-37078019.66666666  -2447545.33333333   -558520.
  -8251116.33333333  -2580949.66666667  -3402037.
   -391208.5         -1139742.5         -4574889.
  -7731026.        ]
Average Accuracy
-6815505.4


In [41]:
y_pred = clf.predict(X_test)

NotFittedError: This DecisionTreeRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [19]:
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Confusion Matrix
[[0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 0 0]]


In [38]:
print('Accuracy')
print(accuracy_score(y_test, y_pred, normalize=True))

Accuracy
0.0


In [39]:
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

      7395.0       0.00      0.00      0.00       0.0
      7758.0       0.00      0.00      0.00       1.0
      7793.0       0.00      0.00      0.00       0.0
      7946.0       0.00      0.00      0.00       1.0
      8449.0       0.00      0.00      0.00       0.0
      8677.0       0.00      0.00      0.00       1.0
      8852.0       0.00      0.00      0.00       1.0
      9142.0       0.00      0.00      0.00       0.0
      9154.0       0.00      0.00      0.00       1.0
      9173.0       0.00      0.00      0.00       0.0
      9791.0       0.00      0.00      0.00       1.0
      9926.0       0.00      0.00      0.00       0.0
     10105.0       0.00      0.00      0.00       1.0
     10238.0       0.00      0.00      0.00       1.0
     10697.0       0.00      0.00      0.00       1.0
     15204.0       0.00      0.00      0.00       0.0
     20117.0       0.00      0.00      0.00       1.0

    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(X, y)
print(skf)

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)


In [25]:
str.count(sub[, start[, end]])

SyntaxError: invalid syntax (364259391.py, line 1)