In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib
import os

In [None]:
import pandas as pd

# Load the dataset (example filename)
df = pd.read_csv("/content/drive/MyDrive/INTERN/companies_sorted.csv") #, usecols=["name", "domain", "industry", "description"])



In [None]:
  df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7173426 entries, 0 to 7173425
Data columns (total 11 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   Unnamed: 0                 int64  
 1   name                       object 
 2   domain                     object 
 3   year founded               float64
 4   industry                   object 
 5   size range                 object 
 6   locality                   object 
 7   country                    object 
 8   linkedin url               object 
 9   current employee estimate  int64  
 10  total employee estimate    int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 602.0+ MB


In [None]:
df.shape

(7173426, 11)

In [None]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906
1,4425416,tata consultancy services,tcs.com,1968.0,information technology and services,10001+,"bombay, maharashtra, india",india,linkedin.com/company/tata-consultancy-services,190771,341369


In [None]:
df.describe()

Unnamed: 0.1,Unnamed: 0,year founded,current employee estimate,total employee estimate
count,7173426.0,3566446.0,7173426.0,7173426.0
mean,3586712.0,2001.747,13.87121,32.25438
std,2070790.0,20.96627,354.5919,874.1556
min,0.0,1451.0,0.0,1.0
25%,1793356.0,1999.0,1.0,1.0
50%,3586712.0,2009.0,1.0,2.0
75%,5380069.0,2013.0,4.0,7.0
max,7173425.0,2103.0,274047.0,716906.0


In [None]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
name,3
domain,1650621
year founded,3606980
industry,290003
size range,0
locality,2508825
country,2349207
linkedin url,0
current employee estimate,0


In [None]:
# Load your huge dataset
df = pd.read_csv("/content/drive/MyDrive/INTERN/companies_sorted.csv")

# Drop rows with no industry
df = df.dropna(subset=["industry"])

# Optional: keep only top N industries
top_industries = df["industry"].value_counts().nlargest(10).index
df = df[df["industry"].isin(top_industries)]

# Create synthetic description
df["Description"] = (
    "A " + df["industry"].fillna("company") +
    " company based in " + df["country"].fillna("unknown") +
    " operating in the " + df["size range"].fillna("N/A") +
    " segment."
)

# Create final DataFrame
df_final = df[["Description", "industry"]].rename(columns={"industry": "Industry"})

# Drop nulls just in case
df_final = df_final.dropna()

# Create the 'data' directory if it doesn't exist
import os
os.makedirs("data", exist_ok=True)

# Save 100,000 rows for training
df_final.sample(100_000, random_state=42).to_csv("data/sample_leads.csv", index=False)

print("✅ sample_leads.csv ready for training")

✅ sample_leads.csv ready for training


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1437169 entries, 0 to 7173421
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   Description  1437169 non-null  object
 1   Industry     1437169 non-null  object
dtypes: object(2)
memory usage: 32.9+ MB


In [None]:
# Step 1: Load and clean data
df = df_final
df.dropna(subset=["Description", "Industry"], inplace=True)
df = df[df["Description"].str.len() > 10]  # Drop short descriptions

In [None]:
# Optional: limit to top 5 industries for balance
top_industries = df["Industry"].value_counts().nlargest(5).index
df = df[df["Industry"].isin(top_industries)]

In [None]:
# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(df["Description"], df["Industry"], test_size=0.2, random_state=42)

# Step 3: Build pipeline
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=10000)),
    ("select", SelectKBest(score_func=chi2, k=1000)),
    ("clf", LogisticRegression(max_iter=300))
])

# Step 4: GridSearchCV
params = {
    "clf": [LogisticRegression(max_iter=300), MultinomialNB()],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__min_df": [3, 5],
    "tfidf__max_df": [0.8, 0.9]
}

In [None]:
grid = GridSearchCV(pipeline, param_grid=params, cv=3, verbose=2, n_jobs=-1)
grid.fit(X_train, y_train)


Fitting 3 folds for each of 16 candidates, totalling 48 fits




In [None]:
# Step 5: Evaluate
y_pred = grid.predict(X_test)
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))



📊 Classification Report:
                                     precision    recall  f1-score   support

                       construction       1.00      1.00      1.00     51352
information technology and services       1.00      1.00      1.00     90731
              management consulting       1.00      1.00      1.00     38707
          marketing and advertising       1.00      1.00      1.00     68060
                        real estate       1.00      1.00      1.00     38584

                           accuracy                           1.00    287434
                          macro avg       1.00      1.00      1.00    287434
                       weighted avg       1.00      1.00      1.00    287434



In [None]:
# Step 6: Save model
best_pipeline = grid.best_estimator_
vectorizer = best_pipeline.named_steps["tfidf"]
classifier = best_pipeline.named_steps["clf"]

os.makedirs("model", exist_ok=True)
joblib.dump(classifier, "model/lead_classifier.pkl")
joblib.dump(vectorizer, "model/tfidf_vectorizer.pkl")
print("\n✅ Model and vectorizer saved to /model/")



✅ Model and vectorizer saved to /model/


In [None]:
# Assuming df_final is a pandas DataFrame
df_final.to_csv('df_final.csv')

In [2]:
# Step 1: Load and clean data
df = pd.read_csv("/content/df_final.csv")
df.dropna(subset=["Description", "Industry"], inplace=True)
df = df[df["Description"].str.len() > 10]  # Drop short descriptions

In [3]:
# Optional: limit to top 5 industries for balance
top_industries = df["Industry"].value_counts().nlargest(5).index
df = df[df["Industry"].isin(top_industries)]

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, GridSearchCV

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["Description"], df["Industry"], test_size=0.2, random_state=42
)

# Step 3: Build pipeline using only MultinomialNB
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=10000)),
    ("select", SelectKBest(score_func=chi2, k=1000)),
    ("clf", MultinomialNB())
])

# Step 4: GridSearchCV (optimize only TF-IDF params)
params = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__min_df": [3, 5],
    "tfidf__max_df": [0.8, 0.9]
}



In [5]:
grid = GridSearchCV(pipeline, param_grid=params, cv=3, verbose=2, n_jobs=-1)
grid.fit(X_train, y_train)


Fitting 3 folds for each of 8 candidates, totalling 24 fits




In [6]:
# Step 5: Evaluate
y_pred = grid.predict(X_test)
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))



📊 Classification Report:
                                     precision    recall  f1-score   support

                  computer software       1.00      1.00      1.00     23042
                       construction       1.00      1.00      1.00     30747
information technology and services       1.00      1.00      1.00     56934
          marketing and advertising       1.00      1.00      1.00     41100
                        real estate       1.00      1.00      1.00     24348

                           accuracy                           1.00    176171
                          macro avg       1.00      1.00      1.00    176171
                       weighted avg       1.00      1.00      1.00    176171

