In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [5]:
PATH = "/content/drive/MyDrive/Progetto Manutenzione"

In [56]:
# Caricare e concatenare i file CSV (Case001.csv-Case177.csv)
data_frames = []
for i in range(1, 178):
    file_path = (PATH + f'/dataset/train/data/Case{i:03d}.csv')
    df = pd.read_csv(file_path)
    df['Case'] = i
    data_frames.append(df)

data = pd.concat(data_frames, ignore_index=True)
data

Unnamed: 0,TIME,P1,P2,P3,P4,P5,P6,P7,Case
0,0.000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1
1,0.001,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1
2,0.002,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1
3,0.003,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1
4,0.004,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1
...,...,...,...,...,...,...,...,...,...
212572,1.196,1.948734,1.912482,1.904362,1.835654,1.924899,1.857220,1.920020,177
212573,1.197,1.948823,1.899824,1.884360,1.825497,1.927513,1.846068,1.919081,177
212574,1.198,1.957784,1.911383,1.893740,1.859805,1.940953,1.861668,1.950562,177
212575,1.199,1.970451,1.950009,1.945417,1.913911,1.953648,1.835381,1.983321,177


In [57]:
column_names = ['Case', 'Spacecraft', 'Condition', 'SV1', 'SV2', 'SV3', 'SV4', 'BP1', 'BP2', 'BP3', 'BP4', 'BP5', 'BP6', 'BP7', 'BV1']

labels = pd.read_csv(PATH + '/dataset/train/labels.csv', names=column_names)

# Sostituisco "Normal" con 0, "Anomaly" e "Fault" con 1
labels['Condition'] = labels['Condition'].map({'Normal': 0, 'Anomaly': 1, 'Fault': 1})

# Sostituisco "No"/"Yes" con 0/1 nelle colonne da "BP1" a "BV1"
for col in ['BP1', 'BP2', 'BP3', 'BP4', 'BP5', 'BP6', 'BP7', 'BV1']:
    labels[col] = labels[col].map({'No': 0, 'Yes': 1})

labels

Unnamed: 0,Case,Spacecraft,Condition,SV1,SV2,SV3,SV4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,BV1
0,1,1,0,100,100,100,100,0,0,0,0,0,0,0,0
1,2,1,0,100,100,100,100,0,0,0,0,0,0,0,0
2,3,1,0,100,100,100,100,0,0,0,0,0,0,0,0
3,4,1,0,100,100,100,100,0,0,0,0,0,0,0,0
4,5,1,0,100,100,100,100,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,173,3,1,100,100,100,100,0,0,0,1,0,0,0,0
173,174,3,1,100,100,100,100,0,0,0,0,1,0,0,0
174,175,3,1,100,100,100,100,0,0,0,0,0,1,0,0
175,176,3,1,100,100,100,100,0,0,0,0,0,0,1,0


In [58]:
#Creazione dati di train

merged_data = pd.merge(data, labels, on="Case", how="left")

train_data = merged_data[["Case", "Condition", "P1", "P2", "P3", "P4", "P5", "P6", "P7"]]

train_data = pd.concat([train_data]*2, ignore_index=True)
train_data

Unnamed: 0,Case,Condition,P1,P2,P3,P4,P5,P6,P7
0,1,0,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000
1,1,0,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000
2,1,0,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000
3,1,0,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000
4,1,0,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000
...,...,...,...,...,...,...,...,...,...
425149,177,1,1.948734,1.912482,1.904362,1.835654,1.924899,1.857220,1.920020
425150,177,1,1.948823,1.899824,1.884360,1.825497,1.927513,1.846068,1.919081
425151,177,1,1.957784,1.911383,1.893740,1.859805,1.940953,1.861668,1.950562
425152,177,1,1.970451,1.950009,1.945417,1.913911,1.953648,1.835381,1.983321


In [59]:
features = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7']

X_train = train_data[features]
y_train = train_data["Condition"]

In [60]:
model_name = "RandomForest"
model = RandomForestClassifier(
    max_depth = None,
    min_samples_leaf = 1,
    min_samples_split = 2,
    n_estimators = 200,
    random_state=42)

print(f"Training {model_name}...")
model.fit(X_train, y_train)

print("DONE.")

Training RandomForest...
DONE.


In [61]:
#Creazione dati di test

# Caricare e concatenare i file CSV (Case178.csv-Case223.csv)
data_frames = []
for i in range(178, 223):
    file_path = (PATH + f'/dataset/test/data/Case{i:03d}.csv')
    df = pd.read_csv(file_path)
    df['Case'] = i
    data_frames.append(df)

test_data = pd.concat(data_frames, ignore_index=True)
test_data

Unnamed: 0,TIME,P1,P2,P3,P4,P5,P6,P7,Case
0,0.000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,178
1,0.001,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,178
2,0.002,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,178
3,0.003,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,178
4,0.004,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,2.000000,178
...,...,...,...,...,...,...,...,...,...
54040,1.196,2.999206,2.985047,2.982110,2.970652,3.004048,3.009258,3.006424,222
54041,1.197,2.977775,2.952302,2.945279,2.933262,2.965413,2.964095,2.962946,222
54042,1.198,2.959111,2.930013,2.915038,2.926461,2.932936,2.921135,2.922863,222
54043,1.199,2.948195,2.921907,2.911228,2.918398,2.910006,2.895828,2.893770,222


In [67]:
# Preparazione dei dati di test
features = test_data[["P1","P2","P3","P4","P5","P6","P7"]]
case_col = test_data[["Case"]]

# Predizione con il modello addestrato
predictions = model.predict(features)
features['Prediction'] = predictions

# Unione dei DataFrame
df_unique = pd.concat([case_col, features], axis=1)

# Calcolo della moda per ogni 'Case'
prediction_result = df_unique.groupby('Case')['Prediction'].apply(lambda x: x.mode().iat[0]).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Prediction'] = predictions


In [68]:
prediction_result

Unnamed: 0,Case,Prediction
0,178,1
1,179,1
2,180,0
3,181,0
4,182,0
5,183,0
6,184,1
7,185,0
8,186,1
9,187,0
