In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Load and prepare data
def load_data(file_path):
    df = pd.read_csv(file_path)
    # Fill missing values
    df['description'].fillna('No description', inplace= True)
    df['requirements'].fillna('No requirements', inplace= True)
    df['location'].fillna('Unknown', inplace= True)
    return df

# Create feature pipeline
def create_pipeline():
    # Text features
    text_features = Pipeline([
        ('tfidf', TfidfVectorizer(max_features= 5000, stop_words= 'english'))
    ])
    
    # Categorial features
    cat_features = Pipeline([
        ('tfidf', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combine all features
    preprocessor = ColumnTransformer([
        ('text', text_features, 'description'),
        ('cat', cat_features, ['location'])
    ])
    
    # Create full pipeline
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators= 100,
            max_depth= None,
            min_samples_split= 2,
            min_samples_leaf= 1,
            max_features= 'sqrt',
            class_weight= 'balanced',
            random_state= 42
        ))
    ])
    
    return model

def main():
    # Load data
    df = load_data('C:\\Users\\vardh\\.vscode\\Identifying-Fake-Job-Listing\\job_train.csv')
    
    # Prepare features and target
    X = df[['description', 'location']]
    y = df['fradulent']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42, stratify= y)
    
    # Create and train model
    model = create_pipeline()
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Print results
    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraudulent']))
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv= 5, scoring= 'f1')
    print("\n Cross-validation F1 scores:", cv_scores)
    print(f"Average F1 score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
    
    

In [4]:
import pickle
with open('job_train_model.pkl', 'wb') as file:
    pickle.dump(model, file)

NameError: name 'model' is not defined