In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Step 1: Read the raw records from the CSV file
file_path = 'SampleData.csv'
df = pd.read_csv(file_path)

# Step 2: Build one text field per row by joining the four free-text columns
# with single spaces; missing values are replaced by empty strings first.
text_columns = ['LongDescription', 'Title', 'Tagline', 'AdditionalDetails']
df['CombinedText'] = df[text_columns].fillna('').agg(' '.join, axis=1)

# Step 3: Keep only rows that have a StatusFlag, then encode it as a binary
# label: 'Kill' -> 0, any other value -> 1.
df = df.dropna(subset=['StatusFlag'])
df['Target'] = (df['StatusFlag'] != 'Kill').astype('int64')

# Step 4: The combined text is the only feature; the binary label is the target
X, y = df['CombinedText'], df['Target']

# Step 5: Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 6: Fit the TF-IDF vocabulary on the training text only, then apply the
# same fitted vectorizer to the test text.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Step 7: Fit a logistic-regression classifier on the TF-IDF features
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_vect, y_train)

# Step 8: Predict the held-out rows and print per-class precision/recall/F1
y_pred = clf.predict(X_test_vect)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      1.00      0.97       978
           1       0.98      0.49      0.65       106

    accuracy                           0.95      1084
   macro avg       0.96      0.74      0.81      1084
weighted avg       0.95      0.95      0.94      1084



In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Step 1: Load the dataset
file_path = 'SampleData.csv'
df = pd.read_csv(file_path)

# Step 2: Combine the free-text columns into a single space-separated string
# per row; missing values are replaced by empty strings before joining.
df['CombinedText'] = df[['LongDescription', 'Title', 'Tagline', 'AdditionalDetails']].fillna('').agg(' '.join, axis=1)

# Step 3: Prepare the target variable.
# Rows without a StatusFlag are dropped first, so the encoding below needs no
# separate missing-value check: 'Kill' -> 0, every other flag -> 1.
df = df.dropna(subset=['StatusFlag'])
df['Target'] = (df['StatusFlag'] != 'Kill').astype('int64')

# Step 4: Split the dataset into features and target, then hold out 20% of
# the rows for testing (fixed seed for repeatability).
X = df['CombinedText']
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Vectorize the text with TF-IDF over unigrams and bigrams. The
# vocabulary is fitted on the training text only and reused for the test text.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Step 6: Train a Logistic Regression model with class_weight='balanced' to
# address the class imbalance.
clf = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
clf.fit(X_train_vect, y_train)

# Step 7: Evaluate the model on the held-out rows
y_pred = clf.predict(X_test_vect)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.97      0.98       978
           1       0.74      0.92      0.82       106

    accuracy                           0.96      1084
   macro avg       0.87      0.94      0.90      1084
weighted avg       0.97      0.96      0.96      1084



In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# --- Data Preparation ---
# Load the dataset
file_path = 'SampleData.csv'
df = pd.read_csv(file_path)

# Combine the free-text fields into one space-separated string per row;
# missing values are replaced by empty strings before joining.
df['CombinedText'] = df[['LongDescription', 'Title', 'Tagline', 'AdditionalDetails']].fillna('').agg(' '.join, axis=1)

# Keep rows with a StatusFlag and create the binary target: 0 if 'Kill', else 1
df = df.dropna(subset=['StatusFlag'])
df['Target'] = df['StatusFlag'].apply(lambda x: 0 if x == 'Kill' else 1)

# Create a 0/1 missingness indicator for each selected numeric column.
# The indicators are computed here, before the pipeline's median imputation
# fills the gaps, so the information "this value was absent" is preserved.
numeric_cols = ['AskingPrice', 'Revenue', 'CashFlow_Calc', 'Margin_Calc', 'Multiple_Calc']
for col in numeric_cols:
    df[f'{col}_missing'] = df[col].isnull().astype(int)

# All numeric features to include (raw values + missingness indicators)
numeric_features = numeric_cols + [f'{col}_missing' for col in numeric_cols]

# --- Building the Pipeline ---

# Pipeline for numeric features: median-impute missing values, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Column transformer combining:
# - The text feature from "CombinedText" with TF-IDF vectorization
#   (unigrams and bigrams, English stop words removed).
# - Numeric features through the numeric pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', ngram_range=(1, 2)), 'CombinedText'),
    ('num', numeric_pipeline, numeric_features)
])

# Define features (X) and target (y).
# X holds only the columns the preprocessor is configured to read, so the
# label columns ('Target', 'StatusFlag') never travel inside the feature frame.
feature_columns = ['CombinedText'] + numeric_features
X = df[feature_columns]
y = df['Target']

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Full pipeline: preprocessing followed by a class-weighted logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])

# --- Model Training and Evaluation ---
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.96      0.98       978
           1       0.73      0.93      0.82       106

    accuracy                           0.96      1084
   macro avg       0.86      0.95      0.90      1084
weighted avg       0.97      0.96      0.96      1084



## Comparing multiple models

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# --- Data Preparation ---
file_path = 'SampleData.csv'
df = pd.read_csv(file_path)

# Combine the free-text columns into one space-separated field per row;
# missing values are replaced by empty strings before joining.
df['CombinedText'] = df[['LongDescription', 'Title', 'Tagline', 'AdditionalDetails']].fillna('').agg(' '.join, axis=1)

# Keep rows with a StatusFlag and create the binary target: 0 if 'Kill', else 1
df = df.dropna(subset=['StatusFlag'])
df['Target'] = df['StatusFlag'].apply(lambda x: 0 if x == 'Kill' else 1)

# Create a 0/1 missingness indicator for each selected numeric column,
# computed before any imputation so the "was absent" signal is kept.
numeric_cols = ['AskingPrice', 'Revenue', 'CashFlow_Calc', 'Margin_Calc', 'Multiple_Calc']
for col in numeric_cols:
    df[f'{col}_missing'] = df[col].isnull().astype(int)

# All numeric features (original numeric columns + missingness indicators)
numeric_features = numeric_cols + [f'{col}_missing' for col in numeric_cols]

# --- Preprocessing Pipeline ---
# Numeric features: median-impute missing values, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# ColumnTransformer: TF-IDF (unigrams + bigrams) on the combined text,
# numeric pipeline on the numeric features.
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', ngram_range=(1, 2)), 'CombinedText'),
    ('num', numeric_pipeline, numeric_features)
])

# Define features and target.
# X holds only the columns the preprocessor is configured to read, so the
# label columns ('Target', 'StatusFlag') never travel inside the feature frame.
feature_columns = ['CombinedText'] + numeric_features
X = df[feature_columns]
y = df['Target']

# Split the data (80% train / 20% test, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# If you have xgboost installed, you can uncomment the following:
# from xgboost import XGBClassifier

# Settings shared by every candidate: a fixed seed for repeatability and
# class re-weighting to compensate for the imbalanced target.
shared_kwargs = dict(random_state=42, class_weight='balanced')

# Candidate classifiers keyed by the display name used when reporting results.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, **shared_kwargs),
    'Random Forest': RandomForestClassifier(**shared_kwargs),
    'SVC': SVC(probability=True, **shared_kwargs),
    # 'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}


In [19]:
from sklearn.base import clone

# Train each candidate model behind the same preprocessing recipe and print
# its classification report on the held-out test split.
for name, model in models.items():
    print(f"Training {name}...")

    # Give every pipeline its own unfitted copy of the preprocessor.
    # Pipeline.fit fits its steps in place, so handing the same object to
    # each pipeline would make them all share one transformer whose fitted
    # state is overwritten on every iteration.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('clf', model)
    ])

    # Train on the training split and predict the test split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Evaluate
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)


Training Logistic Regression...
Results for Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       978
           1       0.73      0.93      0.82       106

    accuracy                           0.96      1084
   macro avg       0.86      0.95      0.90      1084
weighted avg       0.97      0.96      0.96      1084

--------------------------------------------------
Training Random Forest...
Results for Random Forest:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       978
           1       0.99      0.63      0.77       106

    accuracy                           0.96      1084
   macro avg       0.97      0.82      0.88      1084
weighted avg       0.96      0.96      0.96      1084

--------------------------------------------------
Training SVC...
Results for SVC:
              precision    recall  f1-score   support

           0       0.99      0.96     