In [None]:
# Install the Kaggle API token: kaggle.json must already be present in the
# current working directory when this cell runs.
# NOTE: `mv` removes the file from the working directory, so re-running this
# cell without providing kaggle.json again makes the move step fail.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# 600 = read/write for the owner only, since the file holds credentials
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the "quora-question-pairs" competition archive with the Kaggle CLI
# (the log below shows it saved as quora-question-pairs.zip under /content).
!kaggle competitions download -c quora-question-pairs


Downloading quora-question-pairs.zip to /content
 99% 306M/309M [00:04<00:00, 65.9MB/s]
100% 309M/309M [00:04<00:00, 68.7MB/s]


In [None]:
import zipfile

# Unpack the downloaded competition archive into its own working folder
with zipfile.ZipFile('quora-question-pairs.zip', mode='r') as archive:
    archive.extractall(path='/content/quora_question_pairs')


In [None]:
import zipfile

# The competition archive contains train.csv as a nested zip; unpack it
# next to the other extracted files
with zipfile.ZipFile('/content/quora_question_pairs/train.csv.zip', mode='r') as train_archive:
    train_archive.extractall(path='/content/quora_question_pairs')


## Exploratory Data Analysis and Model Training with Random Forest and XGBoost




In [None]:
# 1. Importing Required Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the extracted training CSV into memory as `df`
df = pd.read_csv(filepath_or_buffer="/content/quora_question_pairs/train.csv")


In [None]:
# Dimensions of the loaded frame as (rows, columns)
print(df.shape)  # Output: (404290, 6)

# Preview the first five records
print(df.head(n=5))

(404290, 6)
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  


In [None]:
# Draw a reproducible 30,000-row working sample from the full training set.
#
# Rows missing either question text are dropped before sampling: the
# bag-of-words step later in the notebook cannot vectorize NaN, and checking
# for nulls only after sampling would leave that to chance.
#
# random_state pins the sample so that Restart & Run All selects the same rows
# (and therefore reports the same scores) on every run; 1 matches the seed
# already used for the train/test split.
new_df = (
    df.dropna(subset=['question1', 'question2'])
      .sample(30000, random_state=1)
)

# Sanity check: missing values per column (the question columns must be 0)
print(new_df.isnull().sum())

# Sanity check: number of fully duplicated rows in the sample
print(new_df.duplicated().sum())

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64
0


In [None]:
# Keep only the two question-text columns for feature extraction
ques_df = new_df.loc[:, ['question1', 'question2']]
print(ques_df.head(n=5))

                                                question1  \
381948  What are some ways parents can teach young chi...   
82012            What should I do to speed up my laptop ?   
93355                          How and why did trump win?   
268787  How can I know whether I have what it takes to...   
386261             What are the top Android apps of 2016?   

                                                question2  
381948  When your adult child passes away how do you h...  
82012                      How do I speed up my computer?  
93355              How did Donald Trump win the election?  
268787  How do I know whether I’ll be a good programme...  
386261          What are the top 10 Android Apps of 2015?  


In [None]:
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words text encoder

# Stack all question1 texts followed by all question2 texts so that both
# columns are encoded with one shared vocabulary
questions = ques_df['question1'].tolist() + ques_df['question2'].tolist()

# Encoder capped at 3000 features
cv = CountVectorizer(max_features=3000)

# Fit and encode in one pass, densify, then cut the stacked matrix back into
# its question1 (top half) and question2 (bottom half) parts
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(), 2)

# Wrap each half in a DataFrame that reuses the sampled rows' original index
temp_df1, temp_df2 = (
    pd.DataFrame(half, index=ques_df.index) for half in (q1_arr, q2_arr)
)

# Place the two encodings side by side: question1 columns first, then question2
temp_df = pd.concat([temp_df1, temp_df2], axis='columns')
print(temp_df.shape)  # Output: (30000, 6000)

(30000, 6000)


In [None]:
# Append the label as the last column; the assignment aligns rows on the
# index that temp_df shares with new_df
temp_df['is_duplicate'] = new_df.loc[:, 'is_duplicate']
print(temp_df.head(n=5))

        0  1  2  3  4  5  6  7  8  9  ...  2991  2992  2993  2994  2995  2996  \
381948  0  0  0  0  0  0  0  0  0  0  ...     1     0     1     0     0     0   
82012   0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
93355   0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
268787  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
386261  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   

        2997  2998  2999  is_duplicate  
381948     0     0     0             0  
82012      0     0     0             1  
93355      0     0     0             1  
268787     0     0     0             1  
386261     0     0     0             0  

[5 rows x 6001 columns]


In [None]:
from sklearn.model_selection import train_test_split  # train/test partitioning helper

# Hold out 20% of the rows for evaluation. The label is the last column of
# temp_df; every column before it is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    temp_df.iloc[:, :-1].to_numpy(),  # feature matrix
    temp_df.iloc[:, -1].to_numpy(),   # labels (is_duplicate)
    test_size=0.2,
    random_state=1,                   # fixed seed so the split is repeatable
)


In [None]:
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier
from sklearn.metrics import accuracy_score  # Import accuracy_score to evaluate performance

# Baseline 1: Random Forest on the bag-of-words features.
# Forest construction is randomized, so without a seed every run reports a
# slightly different accuracy. random_state=1 (the same seed as the train/test
# split) makes the fitted model and its score repeatable.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = rf.predict(X_test)

# Fraction of held-out pairs classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy:.3f}')

Random Forest Accuracy: 0.747


In [None]:
from xgboost import XGBClassifier  # gradient-boosted tree classifier

# Baseline 2: XGBoost with default hyperparameters, fitted on the same
# training split as the Random Forest
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred_xgb = xgb.predict(X_test)

# Score with accuracy_score (imported in the Random Forest cell) so the two
# models are compared on the same metric
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
print(f'XGBoost Accuracy: {accuracy_xgb:.3f}')

XGBoost Accuracy: 0.729
