In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# NOTE: once the loop finishes, `dirname` stays bound to the last directory
# visited (if any), so it remains visible to later cells.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### data fields
    ID - an ID for this instance
    Area - (A), The area of a bean zone and the number of pixels within its boundaries.
    Perimeter - (P), Bean circumference is defined as the length of its border.
    MajorAxisLength - (L), The distance between the ends of the longest line that can be drawn from a bean.
    MinorAxisLength - (l), The longest line that can be drawn from the bean while standing perpendicular to the main axis.
    AspectRatio - (K), Defines the relationship between L and l: $K = L / l$. (The CSV column is named `AspectRation`.)
    Eccentricity - (Ec), Eccentricity of the ellipse having the same moments as the region.
    ConvexArea - (C), Number of pixels in the smallest convex polygon that can contain the area of a bean seed.
    EquivDiameter - (Ed), The diameter of a circle having the same area as a bean seed area.
    Extent - (Ex), The ratio of the pixels in the bounding box to the bean area.
    Solidity - (S), Also known as convexity. The ratio of the pixels in the convex shell to those found in beans.
    Roundness - (R), Calculated with the following formula: $R = 4\pi A / P^2$. (The CSV column is named `roundness`.)
    Compactness - (CO), Measures the roundness of an object: $CO = E_d / L$
    ShapeFactor1 - (SF1)
    ShapeFactor2 - (SF2)
    ShapeFactor3 - (SF3)
    ShapeFactor4 - (SF4)
    y - the class of the bean. It can be any of BARBUNYA, SIRA, HOROZ, DERMASON, CALI, BOMBAY, and SEKER.


In [2]:
# Locate the competition files explicitly. Relying on a loop variable left over
# from an earlier os.walk only works when the last directory visited happens to
# be the one that holds the CSVs.
data_dir = next(
    (
        root
        for root, _dirs, files in os.walk('/kaggle/input')
        if {'train.csv', 'test.csv'} <= set(files)
    ),
    None,
)
if data_dir is None:
    raise FileNotFoundError("train.csv / test.csv not found under /kaggle/input")

train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))

# Preview the first rows of the training data.
train_df.head()

In [3]:
# Pair plot of the four shape factors, coloured by bean class.
shape_factor_cols = ['ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4']
data6 = train_df[shape_factor_cols + ['y']]
data6.head()  # NOTE: not the cell's last expression, so this preview is not displayed
sns.set_theme(style="whitegrid")
sns.pairplot(data=data6, hue="y")

In [4]:
# Box plots of the four shape-factor columns.
train_df.boxplot(
    column=['ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4'],
)

In [5]:
def out_preprocessing(df):
    """Encode the bean class names in ``df['y']`` as integer codes 1-7.

    The frame is modified in place and the same object is returned. Labels
    that are not one of the seven known class names become NaN.

    The call is safe to repeat: if every value in ``df['y']`` is already one
    of the integer codes, the column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'y'`` column of class names.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``'y'`` holding integer class codes.
    """
    class_codes = {
        'DERMASON': 1, 'SIRA': 2, 'SEKER': 3, 'HOROZ': 4,
        'CALI': 5, 'BARBUNYA': 6, 'BOMBAY': 7,
    }
    labels = df['y']
    # Re-running the notebook cell would otherwise push the integer codes back
    # through ``map`` and silently turn every label into NaN.
    if not labels.isin(list(class_codes.values())).all():
        df['y'] = labels.map(class_codes)
    return df

    
# Replace the class names in the training target with their integer codes,
# then preview the first rows.
train_df = out_preprocessing(train_df)

train_df.head()

In [6]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

In [7]:
# Number of training rows per (encoded) bean class.
train_df['y'].value_counts()

In [8]:
# errors='ignore' keeps this cell re-runnable: on a second run 'ID' is already
# gone and a plain drop would raise KeyError.
train_df = train_df.drop(columns=['ID'], errors='ignore')

fig, ax = plt.subplots(figsize=(16, 16))
# Draw on the axes created above rather than on the implicit "current" axes.
# 'y' holds arbitrary integer class codes at this point, so read its
# row/column of the correlation matrix with care.
sns.heatmap(train_df.corr(), cmap='RdYlGn', linewidths=0.30, annot=True, ax=ax)

In [9]:
# List the columns of the training frame.
train_df.columns

In [10]:
# Scatter matrix of the shape factors together with the target column.
attributes = [
    'ShapeFactor1',
    'ShapeFactor2',
    'ShapeFactor3',
    'ShapeFactor4',
    'y',
]
scatter_matrix(train_df[attributes], figsize=(16, 12))

In [11]:
from sklearn.model_selection  import train_test_split #import package to create the train and test dataset
from sklearn.linear_model import LogisticRegression #import package to perform Logistic Regression
from sklearn.ensemble import RandomForestClassifier #import package to perform Random Forest
from sklearn.ensemble import GradientBoostingClassifier #import package to perform Gradient Boosting
from sklearn.neighbors import KNeighborsClassifier #import package to perform k-NN classifier
from sklearn.metrics import precision_score, recall_score, f1_score #import metrics score to validate algorithms
from sklearn.metrics import confusion_matrix as CM #import the confusion matrix package to evaluate classification performance
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [12]:
# Feature matrix: every measurement column; target: the encoded class.
feature_columns = [
    'Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
    'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent',
    'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
    'ShapeFactor3', 'ShapeFactor4',
]
X = train_df[feature_columns]
y = train_df['y']

In [13]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Preview the first rows of the training features.
X_train.head()

In [15]:
# Both ensembles are randomised; fixed seeds make the reported scores
# reproducible when the notebook is re-run.
clf2 = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print(classification_report(y_val, clf2.predict(X_val)))

clf3 = GradientBoostingClassifier(max_depth=5, random_state=42).fit(X_train, y_train)
print(classification_report(y_val, clf3.predict(X_val)))

In [16]:
# Standardise the features, then fit an RBF-kernel SVC; report on the validation split.
svc_pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma='auto'))
clf4 = svc_pipeline.fit(X_train, y_train)
val_predictions = clf4.predict(X_val)
print(classification_report(y_val, val_predictions))

In [17]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a fresh scaler + RBF-SVC pipeline on all of X / y.
classifier = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma='auto'))
cv_scores = cross_val_score(classifier, X, y, cv=5)
print(cv_scores)

# Confusion matrix of the already-fitted clf4 on the validation split.
val_confusion = confusion_matrix(y_val, clf4.predict(X_val))
print(val_confusion)

In [18]:
def out_sub(df):
    """Decode the integer class codes in ``df['y']`` back to bean class names.

    The frame is modified in place and the same object is returned. Values
    that are not one of the codes 1-7 become NaN.

    The call is safe to repeat: if every value in ``df['y']`` is already one
    of the class names, the column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'y'`` column of integer class codes.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``'y'`` holding class names.
    """
    code_names = {
        1: 'DERMASON', 2: 'SIRA', 3: 'SEKER', 4: 'HOROZ',
        5: 'CALI', 6: 'BARBUNYA', 7: 'BOMBAY',
    }
    codes = df['y']
    # Decoding twice would otherwise map the names through the table again and
    # silently turn every prediction into NaN.
    if not codes.isin(list(code_names.values())).all():
        df['y'] = codes.map(code_names)
    return df
# Select the same feature columns that were used for training.
test_feature_cols = [
    'Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
    'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent',
    'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
    'ShapeFactor3', 'ShapeFactor4',
]
X_test = test_df.loc[:, test_feature_cols]

# Predict with the fitted SVC pipeline, attach the predictions, and decode the
# integer codes back to class names.
y_pred = clf4.predict(X_test)
test_df['y'] = y_pred
test_df = out_sub(test_df)
test_df.head()

In [19]:
# Write the ID / predicted-class pairs as the submission file.
submission = test_df[['ID', 'y']]
submission.to_csv('/kaggle/working/submission.csv', index=False)