In [1]:
# Importing necessary libraries
import pandas as pd  # data manipulation and analysis
import matplotlib.pyplot as plt  # plotting and visualization
import re  # regular expressions
import string  # string manipulation
from nltk import download  # for downloading nltk data
from nltk.corpus import stopwords  # for stopwords
from nltk.stem import WordNetLemmatizer  # for word lemmatization
import seaborn as sns  # statistical data visualization
import contractions
from nltk.tokenize import word_tokenize

# Downloading required nltk data
download('stopwords')
download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rmora\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rmora\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Loading the dataset
df = pd.read_csv("../Data/ProjectTweets.csv", header=None)

# Display the first 5 rows of the dataset for a quick initial exploration
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# Display information about the dataset (data types, number of non-null values, etc.)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   0       1600000 non-null  int64 
 1   1       1600000 non-null  int64 
 2   2       1600000 non-null  object
 3   3       1600000 non-null  object
 4   4       1600000 non-null  object
 5   5       1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


### Note: 
It seems like there are no missing values in the dataset.  
I will add headers to the columns for a better understanding of the data.

### Dataset Columns:
- `IDs`: The id of the tweet (e.g., 4587)
- `Date`: The date of the tweet (e.g., Sat May 16 23:58:44 UTC 2009)
- `Flag`: The query (e.g., lyx). If there is no query, then this value is NO_QUERY.
- `user`: The user that tweeted (e.g., bobthebuilder)
- `text`: The text of the tweet (e.g., Lyx is cool)

In [4]:
# Define column names for the dataset
column_names = ["Index", "IDs", "Date", "Flag", "user", "text"]

# Reload the dataset with column names
df = pd.read_csv("../Data/ProjectTweets.csv", header=None, names=column_names)

# Display the first 5 rows of the dataset with column names
df.head()

Unnamed: 0,Index,IDs,Date,Flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Checking the number of unique values in 'Index' and 'IDs' columns
unique_index = df["Index"].nunique()
unique_ids = df["IDs"].nunique()

# Displaying the number of unique values
print(f"Number of unique values in 'Index': {unique_index}")
print(f"Number of unique values in 'IDs': {unique_ids}")

Number of unique values in 'Index': 1600000
Number of unique values in 'IDs': 1598315


### Notes on Unique Values:
- The first column, 'Index', seems to be just an index and therefore is not needed.
- The second column, 'IDs', appears to have a few duplicates. As I assumed that 'IDs' should be unique, I will check for and remove any duplicate rows.

In [6]:
# Dropping the 'Index' column
df = df.drop(columns="Index")

In [7]:
# Checking for duplicates in the dataset
duplicates = df.duplicated().sum()

# Displaying the number of duplicate rows
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 1685


### Notes on Duplicates:
- As assumed, 'IDs' is a unique value for each tweet. Therefore, I can drop the 'IDs' column and remove any duplicate tweets to ensure data consistency.

In [8]:
# Dropping duplicate rows
df = df.drop_duplicates()

# Displaying the dataset information after dropping duplicates
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1598315 entries, 0 to 1599999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   IDs     1598315 non-null  int64 
 1   Date    1598315 non-null  object
 2   Flag    1598315 non-null  object
 3   user    1598315 non-null  object
 4   text    1598315 non-null  object
dtypes: int64(1), object(4)
memory usage: 73.2+ MB
None


In [9]:
# Checking unique values in 'Flag' column
unique_flags = df["Flag"].unique()

# Displaying unique values in 'Flag'
display(f"Unique values in 'Flag': {unique_flags}")

"Unique values in 'Flag': ['NO_QUERY']"

### Notes on 'Flag' Column:
- The 'Flag' column will be dropped as it contains only one unique value, which is not relevant for this analysis.

In [10]:
# Dropping the 'Flag' column
df = df.drop(columns="Flag")

In [11]:
# Save the DataFrame to a CSV file
df.to_csv('ProjectTweets_cleaned.csv', index=True)