# **Download Datasets**



In [1]:
# Download necessary packages
!pip install gdown
!pip install pandas

# Download Haiti Dataset
!gdown 1lyemu9dz6L1YS-MTNU25KWSN4X0AydXa # Train
!gdown 1eZtMBEO3kkpd3WzkPeWR2-6DK2uygLNR # Test

# Download Sandy Dataset
!gdown 14AG0JK9t6iYR4nvWhxuU6QyBhoOYD-_5 # Train
!gdown 1gpKC6Ks0nQDkZ8TOBjKZeblhIJiG520e # Test

Downloading...
From: https://drive.google.com/uc?id=1lyemu9dz6L1YS-MTNU25KWSN4X0AydXa
To: /content/haiti_train.csv
100% 172k/172k [00:00<00:00, 64.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1eZtMBEO3kkpd3WzkPeWR2-6DK2uygLNR
To: /content/haiti_test.csv
100% 43.9k/43.9k [00:00<00:00, 51.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=14AG0JK9t6iYR4nvWhxuU6QyBhoOYD-_5
To: /content/sandy_train.csv
100% 155k/155k [00:00<00:00, 61.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1gpKC6Ks0nQDkZ8TOBjKZeblhIJiG520e
To: /content/sandy_test.csv
100% 40.2k/40.2k [00:00<00:00, 53.5MB/s]


# **Load and Combine Datasets**

In [2]:
import pandas as pd

def read_disaster_csv(path):
    """Read one dataset CSV while keeping the literal label 'N/A' as text.

    pandas' default missing-value markers include the strings 'N/A', 'NA',
    'None' and 'null'. With a plain ``pd.read_csv`` the 'N/A' label that
    ``label_mapping`` expects would be parsed as NaN, and the ``dropna()``
    in the cleaning step would then silently discard every such row.
    Here only genuinely empty cells are treated as missing.

    Args:
        path: Location of the CSV file.

    Returns:
        pandas.DataFrame with the file's contents.
    """
    # NOTE(review): assumes no numeric column relies on 'NaN'/'NA' markers;
    # confirm against the CSV schema.
    return pd.read_csv(path, keep_default_na=False, na_values=[''])


# Load the datasets
haiti_train_df = read_disaster_csv('haiti_train.csv')
haiti_test_df = read_disaster_csv('haiti_test.csv')
sandy_train_df = read_disaster_csv('sandy_train.csv')
sandy_test_df = read_disaster_csv('sandy_test.csv')

# Combine train and test datasets for Haiti and Sandy.
# ignore_index=True gives each combined frame a fresh 0..n-1 index.
haiti_df = pd.concat([haiti_train_df, haiti_test_df], ignore_index=True)
sandy_df = pd.concat([sandy_train_df, sandy_test_df], ignore_index=True)

# **Data Cleaning**

In [3]:
# Clean each dataset in one chained pass: first discard every row that has a
# missing value in any column, then discard rows that exactly repeat an
# earlier row (the first occurrence is kept).
haiti_df = haiti_df.dropna().drop_duplicates()
sandy_df = sandy_df.dropna().drop_duplicates()

# **Data Labeling and Categorization**

In [6]:
import warnings

# Numeric class id for every label string expected in the `Label` column.
label_mapping = {'Food': 0, 'Water': 1, 'Energy': 2, 'Medical': 3, 'N/A': 4}


def encode_labels(labels, mapping):
    """Convert label strings to numeric ids without breaking on a re-run.

    A plain ``labels.map(mapping)`` turns the whole column into NaN when the
    cell is executed a second time, because the ids produced by the first
    run are not keys of ``mapping``. Here values that already are valid ids
    pass through unchanged, so the cell can be re-run safely.

    Args:
        labels: pandas Series of label strings (or already-encoded ids).
        mapping: dict from label string to numeric id.

    Returns:
        pandas Series of ids. Values that are neither a key nor an id of
        ``mapping`` become NaN, and a warning reports them.
    """
    known_ids = set(mapping.values())

    def encode(value):
        if value in mapping:
            return mapping[value]
        # Already encoded by an earlier run -> keep; anything else -> NaN.
        return value if value in known_ids else float('nan')

    encoded = labels.map(encode)

    # Only report values that were present but could not be encoded.
    unmapped = labels[encoded.isna() & labels.notna()]
    if not unmapped.empty:
        examples = sorted(set(map(str, unmapped)))[:10]
        warnings.warn(
            f"{len(unmapped)} label value(s) are not in label_mapping "
            f"and became NaN: {examples}"
        )
    return encoded


haiti_df['Label'] = encode_labels(haiti_df['Label'], label_mapping)
sandy_df['Label'] = encode_labels(sandy_df['Label'], label_mapping)


# **Text Preprocessing**

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stopword corpus and build the shared helpers used by
# preprocess_text below.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps: lowercase, replace every character that is not an ASCII letter or
    whitespace with a space, split on whitespace, drop English stopwords,
    apply Porter stemming, and join the remaining tokens with single spaces.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned text as a ``str``; empty if no token survives.
    """
    # Lowercase the text
    text = text.lower()
    # Replace (rather than delete) punctuation, digits and special characters
    # with a space. Deleting them glues neighbouring words together
    # ("food,water" -> "foodwater") and turns contractions into tokens such
    # as "dont" that the stopword filter does not recognise.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Tokenization: split() with no argument collapses runs of whitespace.
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    # Join tokens back to string
    return ' '.join(tokens)

# Apply preprocessing to the raw `Text` column of each dataset.
haiti_df['processed_text'] = haiti_df['Text'].apply(preprocess_text)
sandy_df['processed_text'] = sandy_df['Text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Feature Engineering**

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed

# Fit the vocabulary ONCE, on both corpora, and then only transform each
# dataset. Calling fit_transform separately per dataset re-fits the same
# vectorizer, so afterwards it holds only the vocabulary of the last dataset;
# any later vectorizer.get_feature_names_out() call would then label the
# other dataset's matrix with the wrong terms (or fail if the vocabulary
# sizes differ). A shared fit also makes the columns of both matrices align.
# Note: IDF weights are therefore computed over the combined corpus.
combined_corpus = list(haiti_df['processed_text']) + list(sandy_df['processed_text'])
vectorizer.fit(combined_corpus)

haiti_tfidf = vectorizer.transform(haiti_df['processed_text'])
sandy_tfidf = vectorizer.transform(sandy_df['processed_text'])

# **Dimensionality Reduction with PCA (Optional)**

In [10]:
from sklearn.decomposition import PCA

# Dimensionality reduction using PCA (PCA builds new components from the
# TF-IDF features; it does not select a subset of the existing ones).
PCA_COMPONENTS = 100  # Adjust as needed
PCA_SEED = 42  # Fixed so results are repeatable if the randomized SVD solver is used

# One PCA object per dataset: re-fitting a single object would discard the
# model fitted on the first dataset.
haiti_pca = PCA(n_components=PCA_COMPONENTS, random_state=PCA_SEED)
sandy_pca = PCA(n_components=PCA_COMPONENTS, random_state=PCA_SEED)

# PCA is given dense input here, hence the .toarray() conversion.
haiti_tfidf_pca = haiti_pca.fit_transform(haiti_tfidf.toarray())
sandy_tfidf_pca = sandy_pca.fit_transform(sandy_tfidf.toarray())

# Kept for backward compatibility: previously `pca` ended up fitted on Sandy.
pca = sandy_pca


# **Save Preprocessed Datasets**

In [11]:
# Turn each sparse TF-IDF matrix into a dense DataFrame, one column per term.
# The column names come from `vectorizer`, so they are only meaningful if that
# same fitted vectorizer produced both matrices.
tfidf_columns = vectorizer.get_feature_names_out()
haiti_tfidf_df = pd.DataFrame(haiti_tfidf.toarray(), columns=tfidf_columns)
sandy_tfidf_df = pd.DataFrame(sandy_tfidf.toarray(), columns=tfidf_columns)

# Place the TF-IDF columns next to the original columns. The index is reset
# first so rows are paired by position rather than by the gaps left after
# cleaning.
haiti_preprocessed_df = pd.concat(
    [haiti_df.reset_index(drop=True), haiti_tfidf_df],
    axis=1,
)
sandy_preprocessed_df = pd.concat(
    [sandy_df.reset_index(drop=True), sandy_tfidf_df],
    axis=1,
)

# Write each preprocessed dataset to its own CSV file, without the row index.
for preprocessed_frame, csv_path in (
    (haiti_preprocessed_df, 'haiti_preprocessed.csv'),
    (sandy_preprocessed_df, 'sandy_preprocessed.csv'),
):
    preprocessed_frame.to_csv(csv_path, index=False)

# **Download Preprocessed Datasets**

In [12]:
from google.colab import files

# Trigger a browser download of each preprocessed CSV from the Colab runtime:
# the Haiti file first, then the Sandy file.
for csv_name in ('haiti_preprocessed.csv', 'sandy_preprocessed.csv'):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>