In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=d454ad431a89f44671677ccf0afafb6fae7f2fbff3e6c9c39facf03d5e68a97b
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count
from google.colab import files
import io

In [None]:
# Obtain the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("DuplicateCheck")
    .getOrCreate()
)



In [None]:
# Read the CSV file.
# The file quotes fields by doubling embedded quotes ("" inside a quoted field).
# Spark's CSV reader escapes with a backslash by default, which leaves such
# fields unparsed (the raw quote characters stay in the value), so the escape
# character is set to the double quote as well.
df = spark.read.csv(
    "QuAns.csv",
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
)


In [None]:

# Count total rows (size of the dataset as loaded)
total_rows = df.count()

# Count distinct rows (size once identical rows are collapsed)
distinct_rows = df.distinct().count()

# Calculate number of duplicate rows: every row beyond the first copy of a
# record contributes one to this difference
duplicate_rows = total_rows - distinct_rows



In [None]:
# Report the duplicate summary, one figure per line
print(
    f"Total rows: {total_rows}",
    f"Distinct rows: {distinct_rows}",
    f"Duplicate rows: {duplicate_rows}",
    sep="\n",
)

Total rows: 12
Distinct rows: 10
Duplicate rows: 2


In [None]:
# Print the loaded rows without truncating long cell values
df.show(truncate=False)

+-------------------------------------------------+-------------------+
|question                                         |answer             |
+-------------------------------------------------+-------------------+
|What is the capital of France?                   |Paris              |
|"Who wrote ""Romeo and Juliet""?"                |William Shakespeare|
|What is the largest planet in our solar system?  |Jupiter            |
|What is the chemical symbol for gold?            |Au                 |
|What year did World War II end?                  |1945               |
|Who painted the Mona Lisa?                       |Leonardo da Vinci  |
|What is the capital of Japan?                    |Tokyo              |
|What is the chemical symbol for gold?            |Au                 |
|Who discovered gravity?                          |Isaac Newton       |
|What is the largest ocean on Earth?              |Pacific Ocean      |
|What year did World War II end?                  |1945         

In [None]:
# Remove duplicates; no column subset is passed, so the original `df` is left
# untouched and the result is bound to a new name
df_no_duplicates = df.dropDuplicates()



In [None]:
# Count rows after removing duplicates; the value is reused by the next cell
# to work out how many rows were dropped
no_duplicate_rows = df_no_duplicates.count()
print(f"Number of rows after removing duplicates: {no_duplicate_rows}")




Number of rows after removing duplicates: 10


In [None]:
# Calculate number of duplicate rows removed: original row count minus the
# row count of the deduplicated DataFrame
duplicate_rows_removed = total_rows - no_duplicate_rows
print(f"Number of duplicate rows removed: {duplicate_rows_removed}")



Number of duplicate rows removed: 2


In [None]:
# Export the deduplicated dataset as CSV with a header row, replacing any
# output left at this path by an earlier run.
# NOTE(review): Spark's writer normally produces a directory of part files at
# this path rather than a single .csv file — confirm before relying on the name.
(
    df_no_duplicates.write
    .mode("overwrite")
    .option("header", True)
    .csv("qa_dataset_no_duplicates.csv")
)


In [None]:

# Confirmation message for the Spark CSV export
print("New CSV file 'qa_dataset_no_duplicates.csv' has been created without duplicates.")



New CSV file 'qa_dataset_no_duplicates.csv' has been created without duplicates.


In [None]:
# Preview five rows of the deduplicated data, untruncated
print("\nFirst few rows of the deduplicated dataset:")
df_no_duplicates.show(n=5, truncate=False)


First few rows of the deduplicated dataset:
+-------------------------------------------------+----------+
|question                                         |answer    |
+-------------------------------------------------+----------+
|"Who is the author of ""To Kill a Mockingbird""?"|Harper Lee|
|What year did World War II end?                  |1945      |
|What is the chemical symbol for gold?            |Au        |
|What is the capital of Japan?                    |Tokyo     |
|What is the largest planet in our solar system?  |Jupiter   |
+-------------------------------------------------+----------+
only showing top 5 rows



In [None]:
# Inspect the Python type of the deduplicated DataFrame object (exploration aid)
type(df_no_duplicates)

In [None]:


# Collect the deduplicated rows into pandas and render them as CSV text;
# index=False keeps the pandas row index out of the output
csv_data = (
    df_no_duplicates
    .toPandas()
    .to_csv(index=False)
)



In [None]:
# Save the CSV string to a file in Colab.
# The encoding is pinned to UTF-8 so the text is written the same way on every
# platform, and newline='' stops Python from translating the line endings that
# are already present in csv_data a second time.
output_file_name = 'no_duplicates.csv'
with open(output_file_name, 'w', encoding='utf-8', newline='') as f:
    f.write(csv_data)

print(f"New CSV file '{output_file_name}' has been created without duplicates.")



New CSV file 'no_duplicates.csv' has been created without duplicates.


In [None]:
# Preview five rows of the deduplicated data, untruncated
print("\nFirst few rows of the deduplicated dataset:")
df_no_duplicates.show(n=5, truncate=False)




First few rows of the deduplicated dataset:
+-------------------------------------------------+----------+
|question                                         |answer    |
+-------------------------------------------------+----------+
|"Who is the author of ""To Kill a Mockingbird""?"|Harper Lee|
|What year did World War II end?                  |1945      |
|What is the chemical symbol for gold?            |Au        |
|What is the capital of Japan?                    |Tokyo     |
|What is the largest planet in our solar system?  |Jupiter   |
+-------------------------------------------------+----------+
only showing top 5 rows



In [None]:
# Provide a download link for the new CSV file
# (`files` is the google.colab helper imported at the top of the notebook, so
# this cell only works inside a Colab runtime)
files.download(output_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Inspect the Python type of the deduplicated DataFrame object (exploration aid)
type(df_no_duplicates)

In [None]:
# Print the deduplicated rows without truncating long cell values
df_no_duplicates.show(truncate=False)

+-------------------------------------------------+-------------------+
|question                                         |answer             |
+-------------------------------------------------+-------------------+
|"Who is the author of ""To Kill a Mockingbird""?"|Harper Lee         |
|What year did World War II end?                  |1945               |
|What is the chemical symbol for gold?            |Au                 |
|What is the capital of Japan?                    |Tokyo              |
|What is the largest planet in our solar system?  |Jupiter            |
|Who discovered gravity?                          |Isaac Newton       |
|What is the capital of France?                   |Paris              |
|Who painted the Mona Lisa?                       |Leonardo da Vinci  |
|"Who wrote ""Romeo and Juliet""?"                |William Shakespeare|
|What is the largest ocean on Earth?              |Pacific Ocean      |
+-------------------------------------------------+-------------

In [None]:
# Rebuild the CSV text from the deduplicated rows (pandas row index omitted)
csv_data = (
    df_no_duplicates
    .toPandas()
    .to_csv(index=False)
)

In [None]:
# Echo the raw CSV string so its quoting and line endings can be inspected
csv_data

'question,answer\n"""Who is the author of """"To Kill a Mockingbird""""?""",Harper Lee\nWhat year did World War II end?,1945\nWhat is the chemical symbol for gold?,Au\nWhat is the capital of Japan?,Tokyo\nWhat is the largest planet in our solar system?,Jupiter\nWho discovered gravity?,Isaac Newton\nWhat is the capital of France?,Paris\nWho painted the Mona Lisa?,Leonardo da Vinci\n"""Who wrote """"Romeo and Juliet""""?""",William Shakespeare\nWhat is the largest ocean on Earth?,Pacific Ocean\n'