In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import ndimage
from scipy import stats
from scipy.ndimage import rotate
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import resample


### **Analyse des métadonnées**

In [None]:
# Load the training metadata table.
df = pd.read_csv('metadataTrain.csv')
# Only the last expression of a cell is displayed, so the row count is printed explicitly.
print('Number of rows:', df.shape[0])
df.head()

In [None]:
# Impute missing values column by column. Each result is assigned back to the
# frame rather than calling fillna(..., inplace=True) on a selected column:
# that chained in-place form emits a FutureWarning in pandas 2.x and does not
# modify `df` once copy-on-write is active.

# Missing SEX values are replaced with the constant 'male'.
df['SEX'] = df['SEX'].fillna('male')

# Missing AGE values are replaced with the mean of the observed ages.
df['AGE'] = df['AGE'].fillna(df['AGE'].mean())

# Missing POSITION values are replaced with the placeholder category 'unknown'.
df['POSITION'] = df['POSITION'].fillna('unknown')

# Count the remaining NaN values per column to check the imputation.
df.isna().sum()

In [None]:
# Encode the categorical metadata columns as numbers.

# SEX -> integer codes. LabelEncoder numbers its classes in sorted order,
# so with the values 'female' and 'male' they become 0 and 1 respectively.
le = LabelEncoder()
df['SEX'] = le.fit_transform(df['SEX'])

# POSITION -> one indicator column per distinct value.
ohe = OneHotEncoder()
position_encoded = ohe.fit_transform(df[['POSITION']]).toarray()
# Store each column of the dense one-hot matrix under its category name.
for category, indicator in zip(ohe.categories_[0], position_encoded.T):
    df[category] = indicator
# The original POSITION column is dropped once the indicator columns exist.
df = df.drop(columns='POSITION')
df.head()

In [None]:
# Side-by-side histograms of the AGE and CLASS columns.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# (column, bar colour, panel title, x-axis label) for each panel, left to right.
panels = [
    ('AGE', 'b', 'Age Distribution', 'Age'),
    ('CLASS', 'r', 'Class Distribution', 'Class'),
]
for panel_ax, (col_name, bar_color, panel_title, x_label) in zip(ax, panels):
    panel_ax.hist(df[col_name], bins=20, color=bar_color, alpha=0.7, rwidth=0.85)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

### **Machine learning**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from imblearn.pipeline import make_pipeline as make_pipeline2
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from imblearn.over_sampling import ADASYN
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import ADASYN
import pandas as pd

In [None]:
# Load the training feature table, then separate the target from the inputs.
train_data = pd.read_csv('features_train.csv')
y = train_data['CLASS']                       # target labels
X = train_data.drop(columns=['ID', 'CLASS'])  # every column except ID and the target

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grid explored by the search (3 x 3 = 9 candidates).
param_grid = {
    'n_estimators': [500, 1000, 1500],
    'min_samples_leaf': [2, 5, 10],
}

# The forest is seeded too: without random_state every run grows different
# trees, so the selected parameters and the reported accuracy would change
# from one execution to the next.
RF = RandomForestClassifier(random_state=42)

# 3-fold cross-validated grid search, run on all available cores.
grid_search = GridSearchCV(estimator=RF, param_grid=param_grid, cv=3, n_jobs=-1, verbose=4)
grid_search.fit(X_train, y_train)

# With the default refit=True, predict() uses the best candidate refitted on X_train.
y_pred = grid_search.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of y_test against y_pred, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(7, 5))
# heatmap() returns the Axes it drew on, so the title and both axis labels
# are set on that Axes in a single call.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues').set(
    title='Matrice de confusion',
    ylabel='Vraies classes',
    xlabel='Classes prédites',
)
plt.show()

In [None]:
# Reload the features from the 'not upsampled' file and split them 80/20.
train_data = pd.read_csv('features_train_not_upsampled.csv')
X = train_data.drop(['ID', 'CLASS'], axis=1)
y = train_data['CLASS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# imblearn pipeline: ADASYN oversampling, then standardisation, then a random forest.
# The forest is seeded (as ADASYN already is) so that the printed accuracy and
# the submission file are reproducible from run to run.
# NOTE(review): ADASYN runs before StandardScaler here, so its neighbour search
# sees unscaled features — confirm that this ordering is intended.
RF = make_pipeline2(
    ADASYN(random_state=0),
    StandardScaler(),
    RandomForestClassifier(n_estimators=500, random_state=0),
)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))

# Predict on features_test.csv and write the ID/CLASS submission file.
# Note: X_test is rebound here from the held-out split to the submission features.
test_data = pd.read_csv('features_test.csv')
X_test = test_data.drop(['ID'], axis=1)

Y_pred = RF.predict(X_test)
test_data['CLASS'] = Y_pred
new_submission = test_data[['ID', 'CLASS']]
new_submission.to_csv('nouvelle_submission_RF_adasyn.csv', index=False)

In [1]:
### fit a distribution
# Column of df to inspect. df has no column literally named 'feature', so the
# numeric AGE column is used; change feature_col to look at another column.
feature_col = 'AGE'
plt.hist(df[feature_col], bins=30)
plt.xlabel('Feature')
plt.ylabel('Frequency')
plt.title('Histogram of Feature')
plt.show()

In [None]:
import seaborn as sns

# Column of df to inspect. df has no column literally named 'feature', so the
# numeric AGE column is used; change feature_col to look at another column.
feature_col = 'AGE'
# Kernel density estimate of the selected column.
sns.kdeplot(df[feature_col])
plt.xlabel('Feature')
plt.title('Kernel Density Estimation of Feature')
plt.show()

In [None]:
from scipy.stats import norm

# Sample to fit. `data` was not defined by any earlier cell, so it is bound
# here to the numeric AGE column of df; point it at another column as needed.
data = df['AGE']

# Estimate the mean and standard deviation of a normal distribution from the sample.
mu, std = norm.fit(data)

# Plot the histogram, normalised to a density so it is comparable with the PDF.
plt.hist(data, bins=30, density=True, alpha=0.6, color='g')

# Plot the fitted PDF over the x-range that the histogram occupies.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
title = "Fit results: mu = %.2f,  std = %.2f" % (mu, std)
plt.title(title)

plt.show()

In [None]:
#### transport 1d
def transport1D(X,Y):
    """Return the sorting permutations of two 1-D samples.

    Parameters
    ----------
    X, Y : array-like
        Values to be ordered; each is passed to ``np.argsort`` independently.

    Returns
    -------
    tuple
        ``(sx, sy)`` where ``sx`` holds the indices that would sort ``X`` in
        ascending order and ``sy`` the indices that would sort ``Y``.
    """
    return tuple(np.argsort(sample) for sample in (X, Y))