In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data; the file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
df = pd.read_csv("train.csv", encoding='ISO-8859-1')
# Preview the first rows to confirm the load.
df.head()


In [None]:
# Display the DataFrame itself.
df

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Preview the first rows again.
df.head()

## Analysing Training Dataset

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

#### Distribution of duplicate and non-duplicate questions

In [None]:

# Class balance of the target: absolute counts, then percentages, then a bar chart.
duplicate_counts = df['is_duplicate'].value_counts()
print(duplicate_counts)
duplicate_percent = (duplicate_counts / df['is_duplicate'].count()) * 100
print(duplicate_percent)
duplicate_counts.plot(kind='bar')

#### Finding total number of unique and repeated questions

In [None]:

# Pool the ids from both question columns so every appearance of a question is counted.
qid = pd.concat([df['qid1'], df['qid2']], ignore_index=True)
print('Number of unique questions', len(np.unique(qid)))
# Boolean mask over the distinct ids: True where the id occurs more than once.
x = qid.value_counts() > 1
print('Number of questions getting repeated', int(x.sum()))

#### Frequency of Repeated Questions

In [None]:
# Histogram of how often each question id occurs; the y axis is log-scaled.
fig, ax = plt.subplots()
ax.hist(qid.value_counts().values, bins=160)
ax.set_yscale('log')
ax.set_xlabel('Frequency of Repeated Questions')
ax.set_ylabel('Count (log scale)')
plt.show()


###  Feature Engineering

In [None]:
# Character length of each question; missing questions (NaN) give NaN lengths.
# Both columns are created here: the length-distribution plot further down
# reads 'q1_len', so computing only 'q2_len' would make that cell fail.
df['q1_len'] = df['question1'].str.len()
df['q2_len'] = df['question2'].str.len()

In [None]:
# Check the newly added length feature(s).
df.head()

#### Finding the total number of words in question1 and question2

In [None]:

def _space_token_count(text):
    """Return how many pieces str(text) breaks into when split on single spaces."""
    return len(str(text).split(" "))


# One word-count column per question column.
for _source, _target in (('question1', 'q1_num_words'), ('question2', 'q2_num_words')):
    df[_target] = df[_source].apply(_space_token_count)
df.head()


#### This function counts the distinct words that appear in both questions

In [None]:
def common_words(row):
    """Count the distinct words shared by the two questions of one row.

    Each question is split on single spaces; every piece is lower-cased and
    stripped of surrounding whitespace before the two sets are intersected.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            'question1' and 'question2'.

    Returns:
        int: size of the intersection, or 0 when either question is null.
    """
    if any(pd.isnull(row[column]) for column in ('question1', 'question2')):
        return 0

    def _vocab(sentence):
        # Set of normalised space-separated pieces of one question.
        return {piece.lower().strip() for piece in sentence.split(" ")}

    return len(_vocab(row['question1']).intersection(_vocab(row['question2'])))


In [None]:
# Apply common_words to every row (axis=1) and store the result as a new column.
df['word_common'] = df.apply(common_words, axis=1)
df.head()

#### This function counts the total number of distinct words in the two questions

In [None]:
def total_words(row):
    """Add up the number of distinct words in each question of one row.

    Each question is split on single spaces; every piece is lower-cased and
    stripped of surrounding whitespace, and duplicates within a question are
    counted once. A word present in both questions is counted twice.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            'question1' and 'question2'.

    Returns:
        int: len(set of q1 words) + len(set of q2 words), or 0 when either
        question is null.
    """
    if any(pd.isnull(row[column]) for column in ('question1', 'question2')):
        return 0

    distinct_counts = [
        len({piece.lower().strip() for piece in row[column].split(" ")})
        for column in ('question1', 'question2')
    ]
    return sum(distinct_counts)

# Apply total_words to every row (axis=1) and store the result as a new column.
df['word_total'] = df.apply(total_words, axis=1)

In [None]:
# Fraction of shared words, rounded to two decimals. Rows where both counts
# are 0 (a null question) evaluate 0/0 and therefore hold NaN.
word_ratio = df['word_common'].div(df['word_total'])
df['word_share'] = word_ratio.round(2)
df.head()

## Plots of the features engineered above

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Drop rows whose question1 is missing, then measure the character length of
# question1 directly, so this cell does not depend on a precomputed 'q1_len'
# column being present in df.
df_no_nan = df.dropna(subset=['question1'])
q1_len = df_no_nan['question1'].str.len().dropna().rename('q1_len')

# Analysis of features
sns.distplot(q1_len)
print('minimum characters', q1_len.min())
print('maximum characters', q1_len.max())
print('average num of characters', int(q1_len.mean()))

plt.show()


In [None]:

# Overlay the word_common distribution of each class on the same axes.
for flag, legend_name in ((0, 'non duplicate'), (1, 'duplicate')):
    class_rows = df['is_duplicate'] == flag
    sns.distplot(df.loc[class_rows, 'word_common'], label=legend_name)

plt.xlabel('Word Common Count')
plt.ylabel('Density')
plt.legend()

# Show the plot
plt.show()


In [None]:

# Overlay the word_total distribution of each class on the same axes.
for flag, legend_name in ((0, 'non duplicate'), (1, 'duplicate')):
    class_rows = df['is_duplicate'] == flag
    sns.distplot(df.loc[class_rows, 'word_total'], label=legend_name)

plt.xlabel('Word Total Count')
plt.ylabel('Density')
plt.legend()

# Show the plot
plt.show()


In [None]:
# Word-share values per class, with NaN entries removed before plotting.
non_duplicate_word_share = df.loc[df['is_duplicate'] == 0, 'word_share'].dropna()
duplicate_word_share = df.loc[df['is_duplicate'] == 1, 'word_share'].dropna()

# Overlay both distributions on one set of axes.
for share_values, legend_name in (
    (non_duplicate_word_share, 'non duplicate'),
    (duplicate_word_share, 'duplicate'),
):
    sns.distplot(share_values, label=legend_name)

plt.xlabel('Word Share')
plt.ylabel('Density')
plt.legend()
plt.show()


In [None]:
# Preview the frame with all engineered feature columns.
df.head()

## Applying NLP techniques to convert the text data into numerical data

In [None]:

# Keep only the question pair and the label. The explicit .copy() makes Quora
# an independent frame, so the columns assigned to it in later cells are not
# written to a slice of df (which triggers SettingWithCopyWarning and leaves
# it unclear whether df itself is modified).
required_columns = ['question1', 'question2', 'is_duplicate']
Quora = df[required_columns].copy()
Quora


### Cleaning the data by removing punctuation, whitespace, numbers, stop words, ...

In [None]:
import nltk
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer

# Built once rather than on every call: the English stop-word list without
# 'not' and 'no' (those two are deliberately kept in the text), stored as a
# frozenset so each membership test is O(1).
_STOP_WORDS = frozenset(stopwords.words('english')) - {'not', 'no'}
_WHITESPACE_TOKENIZER = WhitespaceTokenizer()


def clean_text(text):
    """Normalise one question into a space-joined string of kept tokens.

    Steps: lower-case, split on whitespace, strip leading/trailing punctuation
    from each token, then drop tokens that contain a digit, are stop words
    (except 'not'/'no'), or are shorter than two characters.

    Args:
        text: the raw question; anything that is not a ``str`` (e.g. NaN)
            is treated as missing.

    Returns:
        str: the cleaned text, or "" for non-string input.
    """
    if not isinstance(text, str):
        # Missing values arrive as float NaN; map them to an empty string.
        return ""

    stripped = (
        word.strip(string.punctuation)
        for word in _WHITESPACE_TOKENIZER.tokenize(text.lower())
    )
    kept = [
        word for word in stripped
        if len(word) > 1                            # drops empty and one-letter tokens
        and word not in _STOP_WORDS
        and not any(ch.isdigit() for ch in word)    # drops tokens containing numbers
    ]
    return " ".join(kept)

In [None]:
# Add a cleaned version of each question column.
for raw_col, cleaned_col in (("question1", "question1_data"), ("question2", "question2_data")):
    Quora[cleaned_col] = Quora[raw_col].apply(clean_text)

In [None]:
# Drop the raw question columns by name now that cleaned versions exist.
# Naming the columns (with errors='ignore') keeps this cell safe to re-run:
# a positional drop of columns 0 and 1 would remove 'is_duplicate' and
# 'question1_data' on a second execution.
Quora = Quora.drop(columns=['question1', 'question2'], errors='ignore')

In [None]:
# Preview the frame after dropping the raw question columns.
Quora.head()

### remove_contractions, replace_currency_symbols, remove_hyperlinks, remove_html_tags

In [None]:
import string
import re
from multiprocessing import Pool, cpu_count

# Lower-case contraction -> expansion table used by remove_contractions.
# Keys contain only letters and an ASCII apostrophe.
_CONTRACTIONS = {
    "don't": "do not",
    "won't": "will not",
    "can't": "cannot",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
}

# One case-insensitive pattern for all contractions, anchored on word
# boundaries so a contraction is never matched inside a longer word. Both the
# ASCII (') and the typographic (’) apostrophe are accepted.
_CONTRACTION_RE = re.compile(
    r"\b(?:" + "|".join(key.replace("'", "['’]") for key in _CONTRACTIONS) + r")\b",
    re.IGNORECASE,
)


def remove_contractions(text):
    """Expand common English contractions in ``text``.

    Matching ignores case, so lower-cased input such as "i'm" is expanded as
    well as "I'm" (the notebook lower-cases its text before calling this).
    When the matched contraction starts with a capital letter, the expansion
    is capitalised too ("I'm" -> "I am", "Don't" -> "Do not").

    Args:
        text (str): input text.

    Returns:
        str: ``text`` with every known contraction replaced by its expansion.
    """
    def _expand(match):
        found = match.group(0)
        expansion = _CONTRACTIONS[found.lower().replace("’", "'")]
        if found[0].isupper():
            expansion = expansion[0].upper() + expansion[1:]
        return expansion

    return _CONTRACTION_RE.sub(_expand, text)

def replace_currency_symbols(text):
    """Substitute each supported currency symbol in ``text`` with its ISO code.

    The code is inserted in place of the symbol with no added spacing
    (for example "$5" becomes "USD5").

    Args:
        text (str): input text.

    Returns:
        str: ``text`` with the symbols replaced.
    """
    symbol_to_code = str.maketrans({
        "$": "USD",
        "€": "EUR",
        "£": "GBP",
        "¥": "JPY",
        "₹": "INR",
        "₽": "RUB",  # Russian Ruble
        "₩": "KRW",  # South Korean Won
        "฿": "THB",  # Thai Baht
        "₴": "UAH",  # Ukrainian Hryvnia
        "₦": "NGN",
    })
    return text.translate(symbol_to_code)

def remove_hyperlinks(text):
    """Delete every run of non-whitespace characters that begins with 'http'.

    Args:
        text (str): input text.

    Returns:
        str: ``text`` without those runs; surrounding whitespace is left as is.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

def remove_html_tags(text):
    """Delete every '<...>' span from ``text`` (shortest match, within one line).

    Args:
        text (str): input text.

    Returns:
        str: ``text`` with the matched spans removed.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

def _apply_text_cleaners(text):
    """Run the four text clean-up steps on one string, in a fixed order."""
    for step in (remove_contractions, replace_currency_symbols,
                 remove_hyperlinks, remove_html_tags):
        text = step(text)
    return text


def process_column(column):
    """Clean every string of ``column`` in a pool of worker processes.

    Each element goes through remove_contractions, replace_currency_symbols,
    remove_hyperlinks and remove_html_tags, in that order. The steps are
    composed into one function so the data is sent to the workers once
    instead of once per step.

    Args:
        column: iterable of strings (e.g. a pandas Series).

    Returns:
        list: the cleaned strings, in input order.
    """
    with Pool(cpu_count()) as pool:
        return pool.map(_apply_text_cleaners, column)


In [None]:
# Run the extra clean-up steps over both cleaned question columns.
for cleaned_col in ('question1_data', 'question2_data'):
    Quora[cleaned_col] = process_column(Quora[cleaned_col])

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Combine all questions for training Word2Vec
all_questions = list(Quora['question1_data']) + list(Quora['question2_data'])
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=all_questions, size=100, window=5, min_count=1, workers=4)


## Training the data into train and test 

In [None]:
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.

    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector
def word2vec_vectorizer(data, model, num_features):
    if hasattr(model.wv, 'index_to_key'):
        vocabulary = set(model.wv.index_to_key)
    else:
        vocabulary = set(model.wv.index2word)

    features = [average_word_vectors(question, model, vocabulary, num_features) for question in data]
    return np.array(features)

# Vectorize questions
X = word2vec_vectorizer(Quora['question1_data'], word2vec_model, 100)
Y = word2vec_vectorizer(Quora['question2_data'], word2vec_model, 100)
X_train, X_test, Y_train, Y_test = train_test_split(np.hstack((X, Y)), Quora['is_duplicate'], test_size=0.2, random_state=42)


## RandomForestClassifier 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score

# Define the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Define the hyperparameters and their possible values (3 * 3 * 3 * 3 = 81 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use accuracy as the scoring metric for grid search
scorer = make_scorer(accuracy_score)

# Perform Grid Search. n_jobs=-1 spreads the 81 x 5 independent fits over all
# available cores instead of running them one after another.
grid_search = GridSearchCV(clf, param_grid, scoring=scorer, cv=5, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so best_estimator_ is already trained and does not
# need another fit.
best_clf = grid_search.best_estimator_

# Make predictions and evaluate on the held-out split
predictions = best_clf.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
print(f'Accuracy: {accuracy}')
print(classification_report(Y_test, predictions))


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of `clf` on the training split.
cv_scores = cross_val_score(estimator=clf, X=X_train, y=Y_train, scoring='accuracy', cv=5)

# Report the per-fold scores and their mean.
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


#### Confusion Matrix:

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the held-out labels against `predictions`.
cm = confusion_matrix(y_true=Y_test, y_pred=predictions)
print("Confusion Matrix:\n", cm)


#### Precision, Recall, and F1-Score:

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Precision, recall and F1 of `predictions` on the held-out split.
precision = precision_score(y_true=Y_test, y_pred=predictions)
recall = recall_score(y_true=Y_test, y_pred=predictions)
f1 = f1_score(y_true=Y_test, y_pred=predictions)
for metric_name, metric_value in (('Precision', precision), ('Recall', recall), ('F1-Score', f1)):
    print(f'{metric_name}: {metric_value}')


#### Receiver Operating Characteristic (ROC) Curve and Area Under the Curve (AUC):

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC analysis needs a continuous score per sample. Use the predicted
# probability of the positive class (column 1) rather than the hard 0/1
# predictions, which would reduce the curve to a single operating point.
positive_scores = best_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, positive_scores)
roc_auc = auc(fpr, tpr)
print(f'ROC AUC: {roc_auc}')


## SVC Classifier

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Fit a support-vector classifier (default hyper-parameters, fixed seed) on the
# training split and predict the held-out split.
clf_svc = SVC(random_state=42).fit(X_train, Y_train)
predictions_svc = clf_svc.predict(X_test)

# Report accuracy and the per-class metrics.
accuracy_svc = accuracy_score(y_true=Y_test, y_pred=predictions_svc)
print(f'Accuracy: {accuracy_svc}')
print(classification_report(Y_test, predictions_svc))


## KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a k-nearest-neighbours classifier with default settings on the training
# split and predict the held-out split.
clf_knn = KNeighborsClassifier().fit(X_train, Y_train)
predictions_knn = clf_knn.predict(X_test)

# Report accuracy and the per-class metrics.
accuracy_knn = accuracy_score(y_true=Y_test, y_pred=predictions_knn)
print(f'Accuracy: {accuracy_knn}')
print(classification_report(Y_test, predictions_knn))


## XGBClassifier

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a gradient-boosted tree classifier (default hyper-parameters, fixed seed)
# on the training split and predict the held-out split.
clfxgb = XGBClassifier(random_state=42)
clfxgb.fit(X_train, Y_train)
predictions_xgb = clfxgb.predict(X_test)

# Report accuracy and the per-class metrics.
accuracy_xgb = accuracy_score(y_true=Y_test, y_pred=predictions_xgb)
print(f'Accuracy: {accuracy_xgb}')
print(classification_report(Y_test, predictions_xgb))
