In [1]:
import pandas as pd

# Read the files with latin1 encoding
train_df = pd.read_csv('train.csv', encoding='latin1')
test_df = pd.read_csv('test.csv', encoding='latin1')

# Save them back in UTF-8 encoding
train_df.to_csv('train_utf8.csv', index=False, encoding='utf-8')
test_df.to_csv('test_utf8.csv', index=False, encoding='utf-8')

print("Files converted to UTF-8 successfully.")


Files converted to UTF-8 successfully.


In [2]:
train_df = pd.read_csv('train_utf8.csv')
test_df = pd.read_csv('test_utf8.csv')


In [3]:
# Check the structure of the training data
print("Training Data Info:")
print(train_df.info())

# Check for missing values
print("\nMissing Values in Training Data:")
print(train_df.isnull().sum())

# Display some unique values in the columns
for column in train_df.columns:
    print(f"\nUnique values in '{column}':")
    print(train_df[column].unique()[:5])  # Show only first 5 unique values


Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93686 entries, 0 to 93685
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   category            93686 non-null  object
 1   sub_category        87095 non-null  object
 2   crimeaditionalinfo  93665 non-null  object
dtypes: object(3)
memory usage: 2.1+ MB
None

Missing Values in Training Data:
category                 0
sub_category          6591
crimeaditionalinfo      21
dtype: int64

Unique values in 'category':
['Online and Social Media Related Crime' 'Online Financial Fraud'
 'Online Gambling  Betting' 'RapeGang Rape RGRSexually Abusive Content'
 'Any Other Cyber Crime']

Unique values in 'sub_category':
['Cyber Bullying  Stalking  Sexting' 'Fraud CallVishing'
 'Online Gambling  Betting' 'Online Job Fraud' 'UPI Related Frauds']

Unique values in 'crimeaditionalinfo':
['I had continue received random calls and abusive messages in my

In [4]:
# Drop rows with missing 'crimeaditionalinfo'
train_df = train_df.dropna(subset=['crimeaditionalinfo'])

# Fill missing 'sub_category' with a placeholder (e.g., 'Unknown')
train_df['sub_category'] = train_df['sub_category'].fillna('Unknown')

# Verify the changes
print("Updated Missing Values:")
print(train_df.isnull().sum())


Updated Missing Values:
category              0
sub_category          0
crimeaditionalinfo    0
dtype: int64


In [5]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Removing punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to the 'crimeaditionalinfo' column
train_df['processed_text'] = train_df['crimeaditionalinfo'].apply(preprocess_text)

# Display the processed text
print("Sample Processed Text:")
print(train_df[['crimeaditionalinfo', 'processed_text']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sample Processed Text:
                                  crimeaditionalinfo  \
0  I had continue received random calls and abusi...   
1  The above fraudster is continuously messaging ...   
2  He is acting like a police and demanding for m...   
3  In apna Job I have applied for job interview f...   
4  I received a call from lady stating that she w...   

                                      processed_text  
0  continue received random call abusive message ...  
1  fraudster continuously messaging asking pay mo...  
2  acting like police demanding money adding sect...  
3  apna job applied job interview telecalling res...  
4  received call lady stating send new phone vivo...  


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features for efficiency

# Fit and transform the 'processed_text' column
X = tfidf_vectorizer.fit_transform(train_df['processed_text'])

# Display the shape of the resulting feature matrix
print("Feature Matrix Shape:", X.shape)


Feature Matrix Shape: (93665, 5000)


In [13]:
# Merge rare categories into a broader category
train_df['category'] = train_df['category'].replace({
    'Report Unlawful Content': 'Any Other Cyber Crime'  # Replace with a broader category
})

# Merge rare subcategories into a broader subcategory
train_df['sub_category'] = train_df['sub_category'].replace({
    'Against Interest of sovereignty or integrity of India': 'Unknown'  # Replace with a broader subcategory
})

# Verify the changes
print("Updated Category Values:", train_df['category'].unique())
print("Updated Subcategory Values:", train_df['sub_category'].unique())


Updated Category Values: ['Online and Social Media Related Crime' 'Online Financial Fraud'
 'Online Gambling  Betting' 'RapeGang Rape RGRSexually Abusive Content'
 'Any Other Cyber Crime' 'Cyber Attack/ Dependent Crimes'
 'Cryptocurrency Crime' 'Sexually Explicit Act'
 'Sexually Obscene material'
 'Hacking  Damage to computercomputer system etc' 'Cyber Terrorism'
 'Child Pornography CPChild Sexual Abuse Material CSAM'
 'Online Cyber Trafficking' 'Ransomware']
Updated Subcategory Values: ['Cyber Bullying  Stalking  Sexting' 'Fraud CallVishing'
 'Online Gambling  Betting' 'Online Job Fraud' 'UPI Related Frauds'
 'Internet Banking Related Fraud' 'Unknown' 'Other'
 'Profile Hacking Identity Theft' 'DebitCredit Card FraudSim Swap Fraud'
 'EWallet Related Fraud' 'Data Breach/Theft' 'Cheating by Impersonation'
 'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks'
 'FakeImpersonating Profile' 'Cryptocurrency Fraud' 'Malware Attack'
 'Business Email CompromiseEmail Takeover' '

In [14]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoders
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()

# Encode the 'category' and 'sub_category' columns
y_category = category_encoder.fit_transform(train_df['category'])
y_subcategory = subcategory_encoder.fit_transform(train_df['sub_category'])

# Display the encoded categories and subcategories
print("Encoded Categories Sample:", y_category[:5])
print("Encoded Subcategories Sample:", y_subcategory[:5])

# Save the encoders for decoding predictions later
import joblib
joblib.dump(category_encoder, 'category_encoder.pkl')
joblib.dump(subcategory_encoder, 'subcategory_encoder.pkl')


Encoded Categories Sample: [9 7 8 9 7]
Encoded Subcategories Sample: [ 3 14 20 21 14]


['subcategory_encoder.pkl']

In [15]:
from sklearn.model_selection import train_test_split

# Split the data for category classification
X_train, X_val, y_category_train, y_category_val = train_test_split(
    X, y_category, test_size=0.2, random_state=42, stratify=y_category
)

# Split the data for subcategory classification
_, _, y_subcategory_train, y_subcategory_val = train_test_split(
    X, y_subcategory, test_size=0.2, random_state=42, stratify=y_subcategory
)

# Check the shape of the splits
print("Training Set Shape:", X_train.shape)
print("Validation Set Shape:", X_val.shape)


Training Set Shape: (74932, 5000)
Validation Set Shape: (18733, 5000)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Logistic Regression model
category_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the training set
category_model.fit(X_train, y_category_train)

# Make predictions on the validation set
y_category_pred = category_model.predict(X_val)

# Evaluate the model
print("Category Classification Report:")
print(classification_report(y_category_val, y_category_pred))
print("Category Classification Accuracy:", accuracy_score(y_category_val, y_category_pred))


Category Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.26      0.33      2176
           1       0.73      0.14      0.24        76
           2       0.74      0.36      0.49        96
           3       1.00      1.00      1.00       722
           4       0.00      0.00      0.00        32
           5       0.49      0.23      0.31       342
           6       0.00      0.00      0.00        36
           7       0.81      0.94      0.87     11483
           8       0.33      0.01      0.02        89
           9       0.56      0.59      0.57      2428
          10       0.00      0.00      0.00        11
          11       1.00      0.92      0.96       564
          12       0.17      0.01      0.02       310
          13       0.33      0.09      0.14       368

    accuracy                           0.76     18733
   macro avg       0.47      0.33      0.35     18733
weighted avg       0.72      0.76      0.73     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Initialize the Logistic Regression model for subcategory classification
subcategory_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the training set
subcategory_model.fit(X_train, y_subcategory_train)

# Make predictions on the validation set
y_subcategory_pred = subcategory_model.predict(X_val)

# Evaluate the model
print("Subcategory Classification Report:")
print(classification_report(y_subcategory_val, y_subcategory_pred))
print("Subcategory Classification Accuracy:", accuracy_score(y_subcategory_val, y_subcategory_pred))


Subcategory Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        58
           1       0.00      0.00      0.00       398
           2       0.00      0.00      0.00        96
           3       0.00      0.00      0.00       818
           4       0.00      0.00      0.00        32
           5       0.00      0.00      0.00        22
           6       0.00      0.00      0.00        97
           7       0.11      0.02      0.03      2160
           8       0.00      0.00      0.00       152
           9       0.00      0.00      0.00       101
          10       0.00      0.00      0.00        31
          11       0.00      0.00      0.00       809
          12       0.00      0.00      0.00        70
          13       0.00      0.00      0.00       460
          14       0.00      0.00      0.00      1161
          15       0.00      0.00      0.00       108
          16       0.00      0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to oversample the minority classes
smote = SMOTE(random_state=42)
X_train_smote, y_subcategory_train_smote = smote.fit_resample(X_train, y_subcategory_train)

# Train the model on the balanced dataset
subcategory_model = LogisticRegression(max_iter=1000, random_state=42)
subcategory_model.fit(X_train_smote, y_subcategory_train_smote)

# Evaluate the model
y_subcategory_pred = subcategory_model.predict(X_val)

print("Subcategory Classification Report (with SMOTE):")
print(classification_report(y_subcategory_val, y_subcategory_pred))
print("Subcategory Classification Accuracy (with SMOTE):", accuracy_score(y_subcategory_val, y_subcategory_pred))


Subcategory Classification Report (with SMOTE):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        58
           1       0.02      0.03      0.02       398
           2       0.01      0.04      0.01        96
           3       0.03      0.01      0.02       818
           4       0.00      0.03      0.00        32
           5       0.00      0.00      0.00        22
           6       0.00      0.02      0.01        97
           7       0.10      0.02      0.04      2160
           8       0.01      0.03      0.01       152
           9       0.00      0.00      0.00       101
          10       0.00      0.00      0.00        31
          11       0.04      0.02      0.03       809
          12       0.00      0.01      0.00        70
          13       0.03      0.02      0.02       460
          14       0.05      0.02      0.02      1161
          15       0.00      0.03      0.01       108
          16       0.00      0.00