In [1]:
import numpy as np
import pandas as pd
np.random.seed(42)
data_size=1000

data={
    'feature1':np.random.randn(data_size),
    'feature2':np.random.randn(data_size),
    'feature3':np.random.randn(data_size),
    'label':np.random.choice([0,1],data_size)
    
    
}

In [2]:
df=pd.DataFrame(data)

In [3]:
df.to_csv('large_malware_data.csv',index=False)

In [4]:
loaded_df=pd.read_csv('large_malware_data.csv')
print("intial dataframe:")
print(loaded_df.head())

intial dataframe:
   feature1  feature2  feature3  label
0  0.496714  1.399355 -0.675178      1
1 -0.138264  0.924634 -0.144519      1
2  0.647689  0.059630 -0.792420      1
3  1.523030 -0.646937 -0.307962      1
4 -0.234153  0.698223 -1.893615      0


In [5]:
from sklearn.preprocessing import StandardScaler

df.fillna(df.mean(),inplace=True)

features=df.drop('label',axis=1)
scaler=StandardScaler()
normalized_features=scaler.fit_transform(features)

In [6]:
preprocessed_df=pd.DataFrame(normalized_features,columns=features.columns)
preprocessed_df['label']=df['label']

In [7]:
preprocessed_csv_file_path='preprocessing_large_malware_data.csv'
preprocessed_df.to_csv(preprocessed_csv_file_path,index=False)

In [8]:
loaded_preprocessed_df=pd.read_csv(preprocessed_csv_file_path)
print("preprocessed dataframe:")
print(loaded_preprocessed_df.head())

preprocessed dataframe:
   feature1  feature2  feature3  label
0  0.487759  1.332576 -0.692816      1
1 -0.161022  0.856405 -0.152959      1
2  0.642015 -0.011240 -0.812090      1
3  1.536382 -0.719965 -0.319235      1
4 -0.258995  0.629303 -1.932372      0


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
import joblib

In [17]:
df =loaded_preprocessed_df

In [18]:
df.head()

Unnamed: 0,feature1,feature2,feature3,label
0,0.487759,1.332576,-0.692816,1
1,-0.161022,0.856405,-0.152959,1
2,0.642015,-0.01124,-0.81209,1
3,1.536382,-0.719965,-0.319235,1
4,-0.258995,0.629303,-1.932372,0


In [19]:
X=df.drop('label',axis=1)
y=df['label']

In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [21]:
model=RandomForestClassifier()

In [22]:
model.fit(X_train,y_train)

In [23]:
joblib.dump(model,'malware_model.pkl')

['malware_model.pkl']

In [24]:
y_pred=model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
report=classification_report(y_test,y_pred)

In [25]:
print(f'Accuracy:{accuracy}')

Accuracy:0.41


In [26]:
print(report)

              precision    recall  f1-score   support

           0       0.40      0.41      0.40        98
           1       0.42      0.41      0.42       102

    accuracy                           0.41       200
   macro avg       0.41      0.41      0.41       200
weighted avg       0.41      0.41      0.41       200



In [27]:
param_grid={
    'n_estimators':[100,200,300],
    'max_depth':[None,10,20,30]
}

In [28]:
from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(estimator=model,param_grid=param_grid,cv=3,n_jobs=1,verbose=2)

In [29]:
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END ...................max_depth=None, n_estimators=100; total time=   0.2s
[CV] END ...................max_depth=None, n_estimators=100; total time=   0.2s
[CV] END ...................max_depth=None, n_estimators=100; total time=   0.2s
[CV] END ...................max_depth=None, n_estimators=200; total time=   0.5s
[CV] END ...................max_depth=None, n_estimators=200; total time=   0.5s
[CV] END ...................max_depth=None, n_estimators=200; total time=   0.5s
[CV] END ...................max_depth=None, n_estimators=300; total time=   0.7s
[CV] END ...................max_depth=None, n_estimators=300; total time=   0.7s
[CV] END ...................max_depth=None, n_estimators=300; total time=   0.7s
[CV] END .....................max_depth=10, n_estimators=100; total time=   0.2s
[CV] END .....................max_depth=10, n_estimators=100; total time=   0.2s
[CV] END .....................max_depth=10, n_es

In [30]:
best_model=grid_search.best_estimator_
y_pred_best=best_model.predict(X_test)
best_accuracy=accuracy_score(y_test,y_pred_best)
best_report=classification_report(y_test,y_pred_best)

In [31]:
print(f"Best Accuracy:{best_accuracy}")
print(best_report)

Best Accuracy:0.425
              precision    recall  f1-score   support

           0       0.42      0.44      0.43        98
           1       0.43      0.41      0.42       102

    accuracy                           0.42       200
   macro avg       0.43      0.43      0.42       200
weighted avg       0.43      0.42      0.42       200



In [32]:
from imblearn.over_sampling import SMOTE

In [33]:
smote=SMOTE(random_state=42)
X_resampled,y_resampled=smote.fit_resample(X,y)

In [34]:
final_model=RandomForestClassifier(bootstrap=True,max_depth=None,min_samples_leaf=1,min_samples_split=2,n_estimators=200,random_state=42)
final_model.fit(X_resampled,y_resampled)

In [35]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [36]:
y_pred=final_model.predict(X_test)
final_accuracy=accuracy_score(y_test,y_pred)
final_report=classification_report(y_test,y_pred)

In [37]:
print(f"Final Accuracy:{final_accuracy}")
print(final_report)

Final Accuracy:1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        97
           1       1.00      1.00      1.00       103

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

