In [2]:
import pandas as pd

# Read the files with latin1 encoding
train_df = pd.read_csv('train.csv', encoding='latin1')
test_df = pd.read_csv('test.csv', encoding='latin1')

# Save them back in UTF-8 encoding
train_df.to_csv('train_utf8.csv', index=False, encoding='utf-8')
test_df.to_csv('test_utf8.csv', index=False, encoding='utf-8')

print("Files converted to UTF-8 successfully.")


Files converted to UTF-8 successfully.


In [3]:
train_df = pd.read_csv('train_utf8.csv')
test_df = pd.read_csv('test_utf8.csv')


In [4]:
# Check the structure of the training data
print("Training Data Info:")
print(train_df.info())

# Check for missing values
print("\nMissing Values in Training Data:")
print(train_df.isnull().sum())

# Display some unique values in the columns
for column in train_df.columns:
    print(f"\nUnique values in '{column}':")
    print(train_df[column].unique()[:5])  # Show only first 5 unique values


Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93686 entries, 0 to 93685
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   category            93686 non-null  object
 1   sub_category        87095 non-null  object
 2   crimeaditionalinfo  93665 non-null  object
dtypes: object(3)
memory usage: 2.1+ MB
None

Missing Values in Training Data:
category                 0
sub_category          6591
crimeaditionalinfo      21
dtype: int64

Unique values in 'category':
['Online and Social Media Related Crime' 'Online Financial Fraud'
 'Online Gambling  Betting' 'RapeGang Rape RGRSexually Abusive Content'
 'Any Other Cyber Crime']

Unique values in 'sub_category':
['Cyber Bullying  Stalking  Sexting' 'Fraud CallVishing'
 'Online Gambling  Betting' 'Online Job Fraud' 'UPI Related Frauds']

Unique values in 'crimeaditionalinfo':
['I had continue received random calls and abusive messages in my

In [5]:
# Drop rows with missing 'crimeaditionalinfo'
train_df = train_df.dropna(subset=['crimeaditionalinfo'])

# Fill missing 'sub_category' with a placeholder (e.g., 'Unknown')
train_df['sub_category'] = train_df['sub_category'].fillna('Unknown')

# Verify the changes
print("Updated Missing Values:")
print(train_df.isnull().sum())


Updated Missing Values:
category              0
sub_category          0
crimeaditionalinfo    0
dtype: int64


In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Removing punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to the 'crimeaditionalinfo' column
train_df['processed_text'] = train_df['crimeaditionalinfo'].apply(preprocess_text)

# Display the processed text
print("Sample Processed Text:")
print(train_df[['crimeaditionalinfo', 'processed_text']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sample Processed Text:
                                  crimeaditionalinfo  \
0  I had continue received random calls and abusi...   
1  The above fraudster is continuously messaging ...   
2  He is acting like a police and demanding for m...   
3  In apna Job I have applied for job interview f...   
4  I received a call from lady stating that she w...   

                                      processed_text  
0  continue received random call abusive message ...  
1  fraudster continuously messaging asking pay mo...  
2  acting like police demanding money adding sect...  
3  apna job applied job interview telecalling res...  
4  received call lady stating send new phone vivo...  


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features for efficiency

# Fit and transform the 'processed_text' column
X = tfidf_vectorizer.fit_transform(train_df['processed_text'])

# Display the shape of the resulting feature matrix
print("Feature Matrix Shape:", X.shape)


Feature Matrix Shape: (93665, 5000)


In [9]:
# Merge rare categories into a broader category
train_df['category'] = train_df['category'].replace({
    'Report Unlawful Content': 'Any Other Cyber Crime'  # Replace with a broader category
})

# Merge rare subcategories into a broader subcategory
train_df['sub_category'] = train_df['sub_category'].replace({
    'Against Interest of sovereignty or integrity of India': 'Unknown'  # Replace with a broader subcategory
})

# Verify the changes
print("Updated Category Values:", train_df['category'].unique())
print("Updated Subcategory Values:", train_df['sub_category'].unique())


Updated Category Values: ['Online and Social Media Related Crime' 'Online Financial Fraud'
 'Online Gambling  Betting' 'RapeGang Rape RGRSexually Abusive Content'
 'Any Other Cyber Crime' 'Cyber Attack/ Dependent Crimes'
 'Cryptocurrency Crime' 'Sexually Explicit Act'
 'Sexually Obscene material'
 'Hacking  Damage to computercomputer system etc' 'Cyber Terrorism'
 'Child Pornography CPChild Sexual Abuse Material CSAM'
 'Online Cyber Trafficking' 'Ransomware']
Updated Subcategory Values: ['Cyber Bullying  Stalking  Sexting' 'Fraud CallVishing'
 'Online Gambling  Betting' 'Online Job Fraud' 'UPI Related Frauds'
 'Internet Banking Related Fraud' 'Unknown' 'Other'
 'Profile Hacking Identity Theft' 'DebitCredit Card FraudSim Swap Fraud'
 'EWallet Related Fraud' 'Data Breach/Theft' 'Cheating by Impersonation'
 'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks'
 'FakeImpersonating Profile' 'Cryptocurrency Fraud' 'Malware Attack'
 'Business Email CompromiseEmail Takeover' '

In [10]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoders
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()

# Encode the 'category' and 'sub_category' columns
y_category = category_encoder.fit_transform(train_df['category'])
y_subcategory = subcategory_encoder.fit_transform(train_df['sub_category'])

# Display the encoded categories and subcategories
print("Encoded Categories Sample:", y_category[:5])
print("Encoded Subcategories Sample:", y_subcategory[:5])

# Save the encoders for decoding predictions later
import joblib
joblib.dump(category_encoder, 'category_encoder.pkl')
joblib.dump(subcategory_encoder, 'subcategory_encoder.pkl')


Encoded Categories Sample: [9 7 8 9 7]
Encoded Subcategories Sample: [ 3 14 20 21 14]


['subcategory_encoder.pkl']

In [11]:
from sklearn.model_selection import train_test_split

# Split the data for category classification
X_train, X_val, y_category_train, y_category_val = train_test_split(
    X, y_category, test_size=0.2, random_state=42, stratify=y_category
)

# Split the data for subcategory classification
_, _, y_subcategory_train, y_subcategory_val = train_test_split(
    X, y_subcategory, test_size=0.2, random_state=42, stratify=y_subcategory
)

# Check the shape of the splits
print("Training Set Shape:", X_train.shape)
print("Validation Set Shape:", X_val.shape)


Training Set Shape: (74932, 5000)
Validation Set Shape: (18733, 5000)


In [12]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Initialize a base classifier
base_classifier = RandomForestClassifier(random_state=42)

# Wrap it in a multi-output classifier
multi_output_model = MultiOutputClassifier(base_classifier)

# Prepare the targets as a combined array
y_combined_train = list(zip(y_category_train, y_subcategory_train))
y_combined_val = list(zip(y_category_val, y_subcategory_val))

# Train the model
multi_output_model.fit(X_train, y_combined_train)

# Make predictions
y_combined_pred = multi_output_model.predict(X_val)

# Separate predictions for evaluation
y_category_pred, y_subcategory_pred = zip(*y_combined_pred)

# Evaluate category predictions
print("Category Classification Report:")
print(classification_report(y_category_val, y_category_pred))

# Evaluate subcategory predictions
print("Subcategory Classification Report:")
print(classification_report(y_subcategory_val, y_subcategory_pred))


Category Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.09      0.15      2176
           1       0.81      0.17      0.28        76
           2       0.78      0.07      0.13        96
           3       1.00      1.00      1.00       722
           4       0.00      0.00      0.00        32
           5       0.61      0.06      0.10       342
           6       0.00      0.00      0.00        36
           7       0.77      0.98      0.86     11483
           8       0.67      0.02      0.04        89
           9       0.59      0.57      0.58      2428
          10       0.00      0.00      0.00        11
          11       1.00      0.93      0.96       564
          12       0.95      0.06      0.12       310
          13       0.78      0.10      0.17       368

    accuracy                           0.76     18733
   macro avg       0.61      0.29      0.31     18733
weighted avg       0.74      0.76      0.70     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Updated preprocessing function
def preprocess_text(text):
    if isinstance(text, str):  # Ensure the input is a string
        # Lowercasing
        text = text.lower()
        # Removing punctuation and numbers
        text = re.sub(r'[^a-z\s]', '', text)
        # Tokenization
        tokens = word_tokenize(text)
        # Remove stop words and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    else:
        return ''  # Return an empty string for non-string inputs
# Preprocess the test set
test_df['processed_text'] = test_df['crimeaditionalinfo'].apply(preprocess_text)

# Check if there are any issues in the processed text
print("Processed Test Set Sample:")
print(test_df[['crimeaditionalinfo', 'processed_text']].head())


Processed Test Set Sample:
                                  crimeaditionalinfo  \
0  Sir namaskar  mein Ranjit Kumar PatraPaise neh...   
1          KOTAK MAHINDRA BANK FRAUD\r\nFRAUD AMOUNT   
2  The issue actually started when I got this ema...   
3  I am amit kumar from karwi chitrakoot I am tot...   
4  I have ordered  saree and  blouse from rinki s...   

                                      processed_text  
0  sir namaskar mein ranjit kumar patrapaise nehi...  
1             kotak mahindra bank fraud fraud amount  
2  issue actually started got email first glance ...  
3  amit kumar karwi chitrakoot totally depressed ...  
4  ordered saree blouse rinki sur paid amount tak...  


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features for efficiency

# Fit and transform the 'processed_text' column
X = tfidf_vectorizer.fit_transform(test_df['processed_text'])

# Display the shape of the resulting feature matrix
print("Feature Matrix Shape:", X.shape)

Feature Matrix Shape: (31229, 5000)


In [23]:
# Merge rare categories into a broader category
test_df['category'] = test_df['category'].replace({
    'Report Unlawful Content': 'Any Other Cyber Crime'  # Replace with a broader category
})

# Merge rare subcategories into a broader subcategory
test_df['sub_category'] = test_df['sub_category'].replace({
    'Against Interest of sovereignty or integrity of India': 'Unknown'  # Replace with a broader subcategory
})

# Verify the changes
print("Updated Category Values:", train_df['category'].unique())
print("Updated Subcategory Values:", train_df['sub_category'].unique())


Updated Category Values: ['Online and Social Media Related Crime' 'Online Financial Fraud'
 'Online Gambling  Betting' 'RapeGang Rape RGRSexually Abusive Content'
 'Any Other Cyber Crime' 'Cyber Attack/ Dependent Crimes'
 'Cryptocurrency Crime' 'Sexually Explicit Act'
 'Sexually Obscene material'
 'Hacking  Damage to computercomputer system etc' 'Cyber Terrorism'
 'Child Pornography CPChild Sexual Abuse Material CSAM'
 'Online Cyber Trafficking' 'Ransomware']
Updated Subcategory Values: ['Cyber Bullying  Stalking  Sexting' 'Fraud CallVishing'
 'Online Gambling  Betting' 'Online Job Fraud' 'UPI Related Frauds'
 'Internet Banking Related Fraud' 'Unknown' 'Other'
 'Profile Hacking Identity Theft' 'DebitCredit Card FraudSim Swap Fraud'
 'EWallet Related Fraud' 'Data Breach/Theft' 'Cheating by Impersonation'
 'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks'
 'FakeImpersonating Profile' 'Cryptocurrency Fraud' 'Malware Attack'
 'Business Email CompromiseEmail Takeover' '

In [26]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoders
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()

# Encode the 'category' and 'sub_category' columns
y_category_test = category_encoder.fit_transform(test_df['category'])
y_subcategory_test = subcategory_encoder.fit_transform(test_df['sub_category'])

# Display the encoded categories and subcategories
print("Encoded Categories Sample:", y_category[:5])
print("Encoded Subcategories Sample:", y_subcategory[:5])



Encoded Categories Sample: [9 7 8 9 7]
Encoded Subcategories Sample: [ 3 14 20 21 14]


In [27]:
# Make predictions
y_combined_test_pred = multi_output_model.predict(X_test)

# Separate predictions
y_category_test_pred, y_subcategory_test_pred = zip(*y_combined_test_pred)

if y_category_test is not None and y_subcategory_test is not None:
    # Evaluate category predictions
    print("Category Classification Report (Test Set):")
    print(classification_report(y_category_test, y_category_test_pred))

    # Evaluate subcategory predictions
    print("Subcategory Classification Report (Test Set):")
    print(classification_report(y_subcategory_test, y_subcategory_test_pred))
else:
    print("Predictions made, but no labels available for evaluation.")


Category Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.11      0.02      0.03      3620
           1       0.00      0.00      0.00       118
           2       0.00      0.00      0.00       165
           3       0.04      0.04      0.04      1222
           4       0.00      0.00      0.00        55
           5       0.00      0.00      0.00       580
           6       0.00      0.00      0.00        63
           7       0.61      0.78      0.68     19032
           8       0.00      0.00      0.00       150
           9       0.12      0.11      0.11      4079
          10       0.00      0.00      0.00        14
          11       0.03      0.02      0.02       972
          12       0.02      0.00      0.00       531
          13       0.01      0.00      0.00       622
          14       0.00      0.00      0.00         6

    accuracy                           0.49     31229
   macro avg       0.06      0.06    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        97
           1       0.04      0.00      0.01       695
           2       0.00      0.00      0.00       165
           3       0.06      0.01      0.01      1331
           4       0.00      0.00      0.00        55
           5       0.00      0.00      0.00        39
           6       0.00      0.00      0.00       153
           7       0.11      0.02      0.04      3573
           8       0.00      0.00      0.00       265
           9       0.00      0.00      0.00       167
          10       0.00      0.00      0.00        57
          11       0.03      0.00      0.01      1357
          12       0.00      0.00      0.00       117
          13       0.03      0.00      0.00       803
          14       0.04      0.01      0.01      1907
          15       0.00      0.00      0.00       184
          16       0.00      0.00      0.00        17
          17       0.12    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
