# Tic Tac Toe dataset
Simple demostration of random forest using the tic-tac dataset which is provides all possible outcomes of a tic-tac game given inputs are all the position of players, including empty boxes(where the player wins with min plays).
Using 80% train data, we'll predict winner for the remaing 20% using random forest algorithm. 

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
seed=42

# Loading Data

In [2]:
columns = ['top-left-sq','top-middle-sq','top-right-sq','middle-left-sq','middle-middle-sq','middle-right-sq',
            'bottom-left-sq','bottom-middle-sq','bottom-right-sq','winner']
data = pd.read_csv("./datasets/clean_tic_tac_toe.csv",names=columns)

In [3]:
data.head(10)

Unnamed: 0,top-left-sq,top-middle-sq,top-right-sq,middle-left-sq,middle-middle-sq,middle-right-sq,bottom-left-sq,bottom-middle-sq,bottom-right-sq,winner
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive
5,x,x,x,x,o,o,b,b,o,positive
6,x,x,x,x,o,b,o,o,b,positive
7,x,x,x,x,o,b,o,b,o,positive
8,x,x,x,x,o,b,b,o,o,positive
9,x,x,x,x,b,o,o,o,b,positive


In [4]:
data.shape

(958, 10)

# Cleaning/Transforming Data

As algos can only process data if its numeric.
So Lets make the x,o,b more readeable for pandas

In [5]:
#making categoral to numerical values.
features = data.columns
for feature in features:
    data[feature] = data[feature].replace("x", "1")
    data[feature] = data[feature].replace("o", "2")
    data[feature] = data[feature].replace("b", "0")

data['winner'] = data['winner'].replace("positive","X")    
data['winner'] = data['winner'].replace("negative","O")     

In [6]:
data['winner'].unique()

array(['X', 'O'], dtype=object)

In [7]:
data.head(10)

Unnamed: 0,top-left-sq,top-middle-sq,top-right-sq,middle-left-sq,middle-middle-sq,middle-right-sq,bottom-left-sq,bottom-middle-sq,bottom-right-sq,winner
0,1,1,1,1,2,2,1,2,2,X
1,1,1,1,1,2,2,2,1,2,X
2,1,1,1,1,2,2,2,2,1,X
3,1,1,1,1,2,2,2,0,0,X
4,1,1,1,1,2,2,0,2,0,X
5,1,1,1,1,2,2,0,0,2,X
6,1,1,1,1,2,0,2,2,0,X
7,1,1,1,1,2,0,2,0,2,X
8,1,1,1,1,2,0,0,2,2,X
9,1,1,1,1,0,2,2,2,0,X


In [8]:
#times wins per
print("X wins",data["winner"].value_counts()[1],"times")
print("O wins",data["winner"].value_counts()[0],"times")

X wins 332 times
O wins 626 times


# Creating train/test data

In [9]:
X = data.drop(columns=['winner']).astype(int)
Y = data['winner']

In [10]:
# train and test set
X_train, X_test, y_train, y_test = train_test_split(X,Y,random_state=seed,test_size=0.20)

# Model Training and predicting

In [12]:
#using random forest
from sklearn.model_selection import GridSearchCV
rfc = RandomForestClassifier(random_state= 42,oob_score=False)

# max_fea = np.arange(1,9)
params =  {"n_estimators": [350,450,500], 
           "max_depth": [10,9],
            "max_features" : [0.6,0.5],
           "min_samples_split" : [4]
          } 

# finding best parameters 
gscv = GridSearchCV(estimator = rfc, param_grid = params, cv = 10, n_jobs =-1)
gscv.fit(X_train,y_train)

print("GSCV Score: %.2f%% "% gscv.score(X_test,y_test))
gscv.best_params_ 

GSCV Score: 0.97% 


{'max_depth': 9,
 'max_features': 0.5,
 'min_samples_split': 4,
 'n_estimators': 350}

In [13]:
# Final randomforest model
rf_final = RandomForestClassifier(random_state= 42,oob_score=False,max_depth = 9,
      max_features = 0.5,min_samples_split = 4,n_estimators = 350)
cv_score = model_selection.cross_val_score(rf_final, X_train, y_train, cv=10, scoring=scoring)
print("CV_Sc% .2f%% " % cv_score.mean())

 0.98% 


In [16]:
# manually validating
rf_final.fit(X_train,y_train)
rf_final.predict([[1,1,1,1,2,0,2,2,0]])

array(['X'], dtype=object)