In [None]:
%pip install torch numpy pandas fastai

RANDOM FOREST

In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Titanic data from the local ./Titanic folder: test passengers, then the training frame.
tst_df = pd.read_csv('./Titanic/test.csv')
df = pd.read_csv('./Titanic/train.csv')

# Row 0 of DataFrame.mode() holds the most frequent value of each training column;
# proc_data reads this series to fill missing entries.
modes = df.mode().iloc[0]

def proc_data(df):
    """Clean a Titanic frame in place and add the derived ``LogFare`` column.

    Missing fares become 0, every remaining gap is filled from the module-level
    ``modes`` series, ``LogFare`` is set to ``log1p(Fare)``, and ``Embarked`` and
    ``Sex`` are converted to pandas categoricals. The frame passed in is modified;
    nothing is returned.
    """
    # Fare is handled first so its gaps become 0 rather than the modal fare.
    df['Fare'] = df['Fare'].fillna(0)
    df.fillna(modes, inplace=True)
    df['LogFare'] = np.log1p(df.Fare)
    for col in ('Embarked', 'Sex'):
        df[col] = pd.Categorical(df[col])

# Both frames are cleaned in place.
proc_data(df)
proc_data(tst_df)

# random.seed only seeds Python's own `random` module (used further down by
# random.sample). scikit-learn does not read that generator, so the split is
# given an explicit random_state to make it reproducible.
random.seed(42)
trn_df,val_df = train_test_split(df,test_size=0.2,random_state=42)

# Column roles: categorical inputs, continuous inputs and the target.
cats=['Sex','Embarked']
conts=['Age','SibSp','Parch','LogFare','Pclass']
dep="Survived"

In [None]:
# Every feature name except "Sex".
cols = cats + conts
cols.remove("Sex")
# On a fresh run Sex is still a pandas categorical of raw labels here (integer codes
# are only substituted in a later cell), so comparing with 1 matched no rows and
# `males` came out empty. Compare with the label while the column is categorical and
# with the integer code once it has been recoded, so re-running the cell still works.
# Assumes the raw labels are 'female'/'male', i.e. code 1 is 'male' — confirm against train.csv.
if hasattr(trn_df.Sex, 'cat'):
    ismale = trn_df.Sex == 'male'
else:
    ismale = trn_df.Sex == 1
males,females = trn_df[ismale],trn_df[~ismale]

In [None]:
# Swap each categorical column for its integer category codes in both splits.
for frame in (trn_df, val_df):
    frame[cats] = frame[cats].apply(lambda col: col.cat.codes)
def xs_y(df):
    """Split ``df`` into model inputs and target.

    Returns a ``(xs, y)`` tuple: ``xs`` is a copy of the ``cats + conts`` columns and
    ``y`` is the ``dep`` column, or ``None`` when ``df`` has no such column.
    """
    features = df[cats + conts].copy()
    if dep in df:
        target = df[dep]
    else:
        target = None
    return features, target

# Inputs and targets for the training split, then for the validation split.
(trn_xs, trn_y), (val_xs, val_y) = map(xs_y, (trn_df, val_df))

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Tree capped at four leaf nodes, fitted on the training split.
m = DecisionTreeClassifier(max_leaf_nodes=4).fit(X=trn_xs, y=trn_y)

In [None]:
import re
import graphviz

def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """ Draws a representation of a fitted decision tree in IPython.

    Parameters:
    -----------
    t: The tree you wish to draw
    df: The data used to train the tree. This is used to get the names of the features.
    size: Value written to the graph's ``size`` attribute.
    ratio: Value written to the graph's ``ratio`` attribute.
    precision: Passed through to ``export_graphviz`` as its ``precision`` argument.

    Returns:
    --------
    A ``graphviz.Source`` built from the generated DOT text.
    """
    # A plain list is passed because some scikit-learn releases only accepted a
    # list (not a pandas Index) for feature_names.
    s=export_graphviz(t, out_file=None, feature_names=list(df.columns), filled=True,
                      special_characters=True, rotate=False, precision=precision)
    # The graph header is a fixed literal, so plain string replacement is enough to
    # inject the layout attributes right after the opening brace.
    return graphviz.Source(s.replace('Tree {',
       f'Tree {{ size={size}; ratio={ratio}'))
    
# Render the current tree `m`, labelling nodes with the training feature names.
draw_tree(t=m, df=trn_xs, precision=2)

In [None]:
# Refit with a different stopping rule (min_samples_leaf=50) and draw the result larger.
m = DecisionTreeClassifier(min_samples_leaf=50).fit(X=trn_xs, y=trn_y)
draw_tree(t=m, df=trn_xs, size=25)

In [None]:
# Replace the test frame's categorical columns with their integer category codes.
tst_df[cats] = tst_df[cats].apply(lambda col: col.cat.codes)

# Only the inputs are kept; xs_y's second element (the target, or None when the
# column is absent) is discarded.
tst_xs, _ = xs_y(tst_df)

def subm(preds,suff):
    """Write the submission file ``subm_{suff}.csv`` for the test set.

    Parameters:
    -----------
    preds: Predicted ``Survived`` values, one per row of ``tst_df``.
    suff: Suffix used in the output file name.

    Side effect: the predictions are also stored on the module-level ``tst_df``
    as its ``Survived`` column.
    """
    tst_df['Survived'] = preds
    # A single write of the two submission columns, without the index.
    sub_df = tst_df[['PassengerId','Survived']]
    sub_df.to_csv(f'subm_{suff}.csv',index=False)
    
# Predict the test rows with the current tree and save them as subm_tree.csv.
subm(preds=m.predict(tst_xs), suff='tree')

In [None]:
from sklearn.metrics import mean_absolute_error
def get_tree(prop=0.75):
    """Fit a decision tree on a random subset of the training rows.

    ``prop`` is the fraction of rows drawn (without replacement, via
    ``random.sample``). The returned ``DecisionTreeClassifier`` uses
    ``min_samples_leaf=5``.
    """
    total = len(trn_y)
    chosen = random.sample(range(total), int(total*prop))
    sub_xs, sub_y = trn_xs.iloc[chosen], trn_y.iloc[chosen]
    tree = DecisionTreeClassifier(min_samples_leaf=5)
    return tree.fit(sub_xs, sub_y)

# 100 trees, each fitted by get_tree on its own random subset of the training rows.
trees = [get_tree() for _ in range(100)]

# Each tree's class predictions for the validation rows, averaged across trees.
all_props = [tree.predict(val_xs) for tree in trees]
avg_probs = np.mean(np.stack(all_props), axis=0)

mean_absolute_error(val_y,avg_probs)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees with min_samples_leaf=5, scored on the validation split.
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5).fit(X=trn_xs, y=trn_y)
mean_absolute_error(val_y, rf.predict(val_xs))

In [None]:
# Horizontal bar chart of the forest's importance score for each input column.
pd.DataFrame({'cols': trn_xs.columns, 'imp': rf.feature_importances_}).plot(x='cols', y='imp', kind='barh')

In [None]:
import matplotlib.pyplot as plt
# plot_partial_dependence is no longer available in current scikit-learn releases;
# PartialDependenceDisplay.from_estimator is its replacement and takes the same
# estimator / data / feature-list arguments.
from sklearn.inspection import PartialDependenceDisplay
fig,ax = plt.subplots(figsize=(12,4))
PartialDependenceDisplay.from_estimator(rf,val_xs,['Pclass','LogFare'],ax=ax)

In [None]:
%pip install treeinterpreter
%pip install waterfallcharts

In [None]:
from treeinterpreter import treeinterpreter
# First five validation rows, passed to treeinterpreter as a plain array.
row = val_xs.head(5)

# treeinterpreter returns a (prediction, bias, contributions) triple for those rows.
prediction, bias, contributions = treeinterpreter.predict(rf, row.to_numpy())

In [None]:
# contributions[0] is 2-D (the waterfall cell indexes it as [:, 0], with one row per
# feature), so sum over the feature axis only. That keeps one total per output column,
# comparable with prediction[0] and bias[0]; a bare .sum() would collapse everything
# into a single scalar.
prediction[0], bias[0], contributions[0].sum(axis=0)

In [None]:
from waterfall_chart import plot as waterfall
print(val_xs.columns.values)
# Waterfall chart of the first row's contributions (column 0 of its contribution
# matrix), labelled with the feature names.
waterfall(
    val_xs.columns,
    contributions[0][:, 0],
    threshold=0.08,
    rotation_value=45,
    formatting='{:,.3f}',
)

PADDY DISEASE CLASSIFICATION

In [None]:

from pathlib import Path
from fastai.vision.all import *

# Root folder of the paddy competition data; the cell output lists its contents.
path = Path("./paddy-disease-classification")
path.ls()

In [None]:
# Collect the image files found under the training folder.
trn_path = path / 'train_images'
files = get_image_files(trn_path)

In [None]:
# Open the first training image, print its size, and show a 128-pixel thumbnail.
img = PILImage.create(files[0])
print(img.size)
img.to_thumb(128)

In [None]:
from fastcore.parallel import *

def f(o):
    """Return the ``size`` of the image that ``PILImage.create`` loads from ``o``."""
    image = PILImage.create(o)
    return image.size
# Measure every training image using 12 workers, then count how often each size occurs.
sizes = parallel(f, files, n_workers=12)
pd.Series(sizes).value_counts()

In [None]:
# 80/20 train/validation split with a fixed seed; items are squished to 480 and
# batches are augmented down to 128.
dls = ImageDataLoaders.from_folder(
    trn_path,
    valid_pct=0.2,
    seed=42,
    item_tfms=Resize(480, method='squish'),
    batch_tfms=aug_transforms(size=128, min_scale=0.75),
)

In [None]:
# Visual check of the pipeline: display up to six items from one batch.
dls.show_batch(max_n=6)

In [None]:
%pip install --upgrade timm  huggingface_hub

In [None]:
from huggingface_hub import *
# resnet26d learner tracking error rate, switched to fp16 training.
learn = (
    vision_learner(dls, 'resnet26d', metrics=error_rate, path='.')
    .to_fp16()
)

In [None]:
# Learning-rate finder, asking for both the `valley` and `slide` suggestions.
learn.lr_find(suggest_funcs=(valley, slide))

In [None]:
# Fine-tune for three epochs at a base learning rate of 0.01.
learn.fine_tune(epochs=3, base_lr=0.01)

In [None]:
# Load the sample submission file from the data folder and display it.
ss = pd.read_csv(path / 'sample_submission.csv')
ss

In [None]:
# Test images in sorted order, wrapped in a test DataLoader built from `dls`.
tst_files = get_image_files(path / 'test_images').sorted()
tst_dl = dls.test_dl(tst_files)

In [None]:
# with_decoded=True adds the decoded predictions as a third return value (idxs);
# the middle element is not used.
probs, _, idxs = learn.get_preds(dl=tst_dl, with_decoded=True)
idxs

In [None]:
# Display the vocabulary held by the data loaders.
dls.vocab

In [None]:
# Position -> vocabulary entry lookup, used to translate the predicted indices.
mapping = {position: label for position, label in enumerate(dls.vocab)}
results = pd.Series(idxs.numpy(), name="idxs").map(mapping)
results

In [None]:
# Store the mapped predictions in the `label` column (pandas aligns them on the
# index), then display the frame.
ss['label'] = results
ss

In [None]:
# Save the submission frame without its index.
ss.to_csv('submission.csv', index=False)

In [None]:
import os

# Detect the runtime instead of hard-coding the flag. Kaggle-hosted kernels are
# expected to set KAGGLE_KERNEL_RUN_TYPE (NOTE(review): confirm on the target
# runtime); when it is absent we are running locally and submit through the API.
iskaggle = bool(os.environ.get('KAGGLE_KERNEL_RUN_TYPE', ''))
if not iskaggle:
    from kaggle import api
    api.competition_submit('submission.csv','initial rn26d','paddy-disease-classification')

In [None]:
# Write resized copies of the training images (max_size=256) into ./sml and
# train from that folder from here on.
trn_path = Path('sml')
resize_images(path / 'train_images', dest=trn_path, max_size=256, recurse=True)

In [None]:
# Same loader recipe as before, now reading from `trn_path` and squishing items to 256.
dls = ImageDataLoaders.from_folder(
    trn_path,
    valid_pct=0.2,
    seed=42,
    item_tfms=Resize(256, method='squish'),
    batch_tfms=aug_transforms(size=128, min_scale=0.75),
)

In [None]:
def train(arch, item, batch, epochs=5):
    """Build data loaders from ``trn_path``, fine-tune ``arch`` and return the learner.

    Parameters:
    -----------
    arch: Architecture handed to ``vision_learner``.
    item: Item transforms for the data loaders.
    batch: Batch transforms for the data loaders.
    epochs: Number of epochs passed to ``fine_tune`` (learning rate is fixed at 0.01).
    """
    loaders = ImageDataLoaders.from_folder(
        trn_path, valid_pct=0.2, seed=42, item_tfms=item, batch_tfms=batch)
    learner = vision_learner(loaders, arch, metrics=error_rate, path='.').to_fp16()
    learner.fine_tune(epochs, 0.01)
    return learner

# resnet26d for 3 epochs: items squished to 192, batches augmented at 128.
learn = train(
    'resnet26d',
    item=Resize(192, method='squish'),
    batch=aug_transforms(size=128, min_scale=0.75),
    epochs=3,
)

In [None]:
# Same recipe with a ConvNeXt architecture, Resize's default method and train()'s
# default epoch count.
arch = 'convnext_small_in22k'
learn = train(arch, item=Resize(192), batch=aug_transforms(size=128, min_scale=0.75))