#### Importing all the required libraries

In [17]:
import os
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Location of the labelled URL dataset: a CSV with a 'url' column and a
# 'label' column (both are read by name below).
url = r'C:\Users\Sai keerthan\Downloads\malware_detection_url\Malware-Detection-using-Machine-learning-main\Dataset\data_url.csv'

# Check if the file exists
if not os.path.exists(url):
    print(f"Error: The file was not found at {url}")
else:
    # Load the CSV file. on_bad_lines='skip' silently drops rows pandas
    # considers malformed (too many fields); rows with too few fields are
    # still loaded, with the missing cells set to NaN.
    url_df = pd.read_csv(url, delimiter=',', on_bad_lines='skip')

    # Confirm the DataFrame structure
    print(url_df.head())

    # A row without a URL or without a label cannot be used for supervised
    # training, and a NaN URL would break the tokenizer's .lower() call,
    # so discard incomplete rows before extracting the columns.
    rows_before = len(url_df)
    url_df = url_df.dropna(subset=['url', 'label'])
    rows_dropped = rows_before - len(url_df)
    if rows_dropped:
        print(f"Dropped {rows_dropped} rows with a missing url or label")

    # Extract URLs and labels explicitly by column name. URLs are coerced to
    # str so every document handed to the tokenizer supports string methods.
    urls = url_df['url'].astype(str).tolist()
    y = url_df['label'].tolist()

    # Verify unique labels
    print("Unique labels:", set(y))

    def sanitization(web):
        """Split a URL into a de-duplicated list of lower-cased tokens.

        The lower-cased URL is cut on '/', every resulting piece on '-',
        and every hyphen-level piece on '.'. Both the hyphen-level pieces
        and their dot-level sub-pieces are kept, so 'a.b' contributes
        'a.b', 'a' and 'b'. Empty strings produced by adjacent or trailing
        separators are kept as tokens. The token 'com' is excluded.

        Returns a list whose order is unspecified (it is built from a set).
        """
        lowered = str(web.lower())
        fragments = set()
        for path_piece in lowered.split('/'):
            for hyphen_piece in path_piece.split('-'):
                # keep the piece as-is and also its dot-separated parts
                fragments.add(hyphen_piece)
                fragments.update(hyphen_piece.split('.'))
        # 'com' is never emitted (presumably too common to be informative)
        fragments.discard('com')
        return list(fragments)

    # Turn each URL into a TF-IDF weighted bag of tokens, using the custom
    # URL tokenizer instead of scikit-learn's default word regex.
    # token_pattern=None: the default pattern is never used once a tokenizer
    # is supplied, and leaving it set makes scikit-learn warn on every fit.
    vectorizer = TfidfVectorizer(tokenizer=sanitization, token_pattern=None)
    x = vectorizer.fit_transform(urls)

    # Hold out 20% of the rows for evaluation; stratify on the labels so both
    # splits keep the same class proportions, and fix the seed so the split
    # is reproducible.
    # NOTE(review): the vectorizer above is fitted on the full dataset before
    # this split, so its vocabulary and IDF weights have seen the test URLs.
    # The reported score may therefore be slightly optimistic -- consider
    # fitting the vectorizer on the training split only.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train the Logistic Regression model (max_iter raised to 1000 to give
    # the lbfgs solver more iterations than its default).
    lgr = LogisticRegression(solver='lbfgs', max_iter=1000)
    lgr.fit(x_train, y_train)

    # Calculate and print the score on the held-out split, as a percentage
    score = lgr.score(x_test, y_test)
    print("Score: {0:.2f} %".format(100 * score))

    # Keep a reference to the fitted vectorizer under the name that the
    # persistence cell pickles.
    vectorizer_save = vectorizer

                      url label
0  diaryofagameaddict.com   bad
1        espdesign.com.au   bad
2      iamagameaddict.com   bad
3           kalantzis.net   bad
4   slightlyoffcenter.net   bad
Unique labels: {'good', 'bad'}




Score: 96.20 %


In [18]:
# Persist the trained classifier. The `with` block closes the file on exit,
# so no explicit close() call is needed afterwards.
file = r"C:\Users\Sai keerthan\Downloads\malware_detection_url\Malware-Detection-using-Machine-learning-main\Classifier\pickel_model.pkl"
with open(file, 'wb') as f:
    pickle.dump(lgr, f)

# Persist the fitted vectorizer alongside the model.
# NOTE: pickle stores functions by reference (module + name), not by value.
# The vectorizer holds the custom `sanitization` tokenizer, so whatever
# process unpickles this file must expose a function with the same name in
# the same module (here the notebook's __main__) before calling pickle.load.
file2 = r"C:\Users\Sai keerthan\Downloads\malware_detection_url\Malware-Detection-using-Machine-learning-main\Classifier\pickel_vector.pkl"
with open(file2, 'wb') as f2:
    pickle.dump(vectorizer_save, f2)