In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy

In [2]:
# Load the raw question/answer pairs. The file is decoded as latin-1.
# NOTE(review): the displayed frame shows an 'Unnamed: 0' column, presumably an
# index that was saved with the CSV — confirm whether it should be kept.
df = pd.read_csv(
    filepath_or_buffer="qa_dataset.csv",
    encoding="latin-1",
)
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,Question,Answer
0,What are the main goals and objectives of your...,The main goals and objectives of our business ...
1,"What are the strengths, weaknesses, opportunit...",Strengths:\n1. Established brand reputation.\n...
2,Who are your target customers and what are the...,Our target customers encompass a diverse range...
3,How do you differentiate your products or serv...,We differentiate our products and services thr...
4,What is your unique value proposition or compe...,Our unique value proposition lies in our abili...
...,...,...
1121,What are the short-term and long-term plans an...,Short-term plans and strategies for business g...
1122,What is the difference between a sole propriet...,#### Difference between a Sole Proprietorship ...
1123,What are the four Ps of marketing,"The four Ps of marketing are **Product, Price,..."
1124,What is a SWOT analysis and how can it help a ...,#### What is a SWOT analysis and how can it he...


In [3]:
# Drop every row where the question or the answer is missing. The frame is
# modified in place, so `df` remains the same object for the cells below.
df.dropna(
    axis=0,
    how='any',
    subset=['Question', 'Answer'],
    inplace=True,
)

In [4]:
def clean_text(text):
    """Strip HTML tags and every non-letter character from ``text``.

    Steps, in order:
      1. Missing values (``None`` / NaN) become ``""``; any other non-string
         value is converted with ``str()`` so the regexes never raise
         ``TypeError``.
      2. Each ``<...>`` tag is replaced by a single space, so the words on
         either side of a tag stay separate.
      3. Every character other than ASCII letters and whitespace is removed
         (digits, punctuation, symbols and accented letters are dropped).

    Whitespace is left untouched: runs of spaces/newlines are not collapsed
    and the result is not stripped.

    Args:
        text: The raw text to clean; normally a ``str``.

    Returns:
        str: The cleaned text, with the original letter case preserved.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Substitute a space (not '') so "Hello<br>World" does not become "HelloWorld".
    text = re.sub(r'<.*?>', ' ', text)
    # Keep ASCII letters and whitespace only.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

In [5]:
# Run clean_text element-wise over the raw text columns and store the results
# in new columns, leaving 'Question' and 'Answer' untouched.
df['cleaned_question'] = df['Question'].map(clean_text)
df['cleaned_answer'] = df['Answer'].map(clean_text)


In [6]:
# Lower-case both cleaned text columns, overwriting them in the same frame.
df[['cleaned_question', 'cleaned_answer']] = (
    df[['cleaned_question', 'cleaned_answer']]
    .apply(lambda column: column.str.lower())
)

In [7]:
# Split each cleaned string into a list of tokens with NLTK's word_tokenize.
df['tokenized_question'] = df['cleaned_question'].map(word_tokenize)
df['tokenized_answer'] = df['cleaned_answer'].map(word_tokenize)

In [8]:
# English stop-word list from the NLTK corpus, held as a set for fast lookups.
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return kept


df['filtered_question'] = df['tokenized_question'].apply(_without_stop_words)
df['filtered_answer'] = df['tokenized_answer'].apply(_without_stop_words)

In [9]:
# One shared lemmatizer instance, reused for every row.
lemmatizer = WordNetLemmatizer()


def _lemmatize_all(tokens):
    """Lemmatize each token with ``lemmatizer`` (default arguments), keeping order."""
    return list(map(lemmatizer.lemmatize, tokens))


df['lemmatized_question'] = df['filtered_question'].apply(_lemmatize_all)
df['lemmatized_answer'] = df['filtered_answer'].apply(_lemmatize_all)

In [10]:
# Preview the first five rows: raw text next to its lemmatized token list.
print(
    df.loc[:, ['Question', 'lemmatized_question', 'Answer', 'lemmatized_answer']].head(5)
)


                                            Question   
0  What are the main goals and objectives of your...  \
1  What are the strengths, weaknesses, opportunit...   
2  Who are your target customers and what are the...   
3  How do you differentiate your products or serv...   
4  What is your unique value proposition or compe...   

                                 lemmatized_question   
0                  [main, goal, objective, business]  \
1  [strength, weakness, opportunity, threat, swot...   
2  [target, customer, need, preference, pain, point]   
3      [differentiate, product, service, competitor]   
4  [unique, value, proposition, competitive, adva...   

                                              Answer   
0  The main goals and objectives of our business ...  \
1  Strengths:\n1. Established brand reputation.\n...   
2  Our target customers encompass a diverse range...   
3  We differentiate our products and services thr...   
4  Our unique value proposition lies in our ab

In [11]:
# Write the whole frame (original, intermediate and final columns) to disk
# without the row index. The target name has no file extension.
# NOTE(review): the token-list columns are written as their string
# representation, so they come back as plain strings when the file is re-read.
df.to_csv(path_or_buf="your_preprocessed_dataset", index=False)