In [1]:
import pandas as pd
import numpy as np
import tqdm

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import spacy
from torch.utils.data import TensorDataset, DataLoader
from sklearn.utils import resample
from torch.nn.utils.rnn import pad_sequence

import torch
from torchtext import data
from sklearn.metrics import f1_score
import pandas as pd 
import re
import random

import torch.nn as nn
import torch.nn.functional as F
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import torch.optim as optim

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
raw_train_df = pd.read_csv("./data/train.csv")
raw_test_df = pd.read_csv("./data/test.csv")
raw_train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
del raw_train_df['keyword']
del raw_train_df['location']
del raw_train_df['id']

In [5]:
def text_cleaner(text):
    text = text.lower() # convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text) # remove extra whitespace
    text = re.sub(r'https?://\S+', '', text) # remove URLs
    text = re.sub(r"#", "", text)
    return text

In [6]:
# Replaces the null values in the data with an empty string
train_data = raw_train_df.where((pd.notnull(raw_train_df)),'')
test_data = raw_test_df.where((pd.notnull(raw_test_df)),'')

train_data['text'] = train_data['text'].apply(text_cleaner)
test_data['text'] = test_data['text'].apply(text_cleaner)

In [7]:
# X=train_data['text'].apply(gensim.utils.simple_preprocess)
# Y = train_data['target']
# model.wv["holding"].shape

df_majority = train_data[train_data['target'] == 0]
df_minority = train_data[train_data['target'] == 1]

# Upsample the minority class
df_minority_upsampled = df_minority.sample(replace=True, n=len(df_majority), random_state=123)

# Combine the majority class and the upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Shuffle the rows of the new dataframe
train_data = df_upsampled.sample(frac=1).reset_index(drop=True)

X, Y = train_data['text'], train_data['target']

X_test= test_data['text']

In [15]:
def embed(docs):
    nlp = spacy.load('en_core_web_md')
    docs_tensor = []
    pbar = tqdm.trange(docs.shape[0])
    for t in pbar:
        doc = nlp(docs[t])
        sentence_embeddings = [token.vector for token in doc]
        docs_tensor.append(sentence_embeddings)

    docs_tensor = [torch.tensor(d) for d in docs_tensor]
    docs_tensor = pad_sequence(docs_tensor, batch_first=True)

    print(docs_tensor.shape)
    return docs_tensor

In [16]:
X_tensor = embed(X)
X_test_tensor = embed(X_test)

100%|█████████████████████████████████████████████████████████████████████████████| 8684/8684 [00:39<00:00, 218.08it/s]


torch.Size([8684, 32, 300])


100%|█████████████████████████████████████████████████████████████████████████████| 3263/3263 [00:15<00:00, 213.80it/s]


torch.Size([3263, 33, 300])


In [17]:
X_train, X_val, Y_train, Y_val = train_test_split(X_tensor, Y, test_size=0.2, random_state= 3)


Y_train = torch.from_numpy(Y_train.values)
Y_val = torch.from_numpy(Y_val.values)


train_dataset = TensorDataset(X_train, Y_train)

# Create a DataLoader for the dataset
batch_size = 50
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


val_dataset = TensorDataset(X_val, Y_val)

val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


test_dataset = TensorDataset(X_test_tensor)

test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

print(X_train.shape, X_val.shape, X_test_tensor.shape)

torch.Size([6947, 32, 300]) torch.Size([1737, 32, 300]) torch.Size([3263, 33, 300])


In [39]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(GRU, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
#         self.rnn = nn.RNN(input_size, hidden_size,num_layers, batch_first = True)
        self.gru = nn.GRU(input_size, hidden_size,num_layers, dropout=0.2, batch_first = True)
#         self.lstm = nn.LSTM(input_size, hidden_size,num_layers, batch_first = True)
        #  the input x has to be in this format x -> (batch_size, seq, input_size)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
#         c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        # below output will be the out put of all the time steps but we need of only the latest
#         out, _ = self.rnn(x, h_0)
        out, _ = self.gru(x, h_0)
#         out, _ = self.gru(x, (h_0,c_0))
        # outputs is how dimensions batch_size, seq_length, hiden_size
        # out (N, 28, 128)
        out = out[:,-1,:]
        # out (N, 128)
        out = self.fc(out)
        return out

In [45]:
# we will keep feeding the RNN one row at a time 
input_size = 300
n_hidden = 2
hidden_dims = 128

n_categories = 2
num_epochs = 10
learning_rate = 0.0007
all_losses = []

model = GRU(input_size, hidden_dims, n_hidden, n_categories)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [46]:
def category_from_output(output):
    category_index = torch.argmax(output).item()
    return category_index

for epoch in range(num_epochs):
    iteration = 0
    n_correct = 0
    n_samples = 0
    for batch_X, batch_Y in train_dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        output = model(batch_X)
        # Compute the loss
        loss = criterion(output, batch_Y)
        
        _, predictions = torch.max(output, 1)
        n_samples  += batch_Y.shape[0]
        n_correct += (predictions == batch_Y).sum().item()
        acc = 100 * n_correct / n_samples
        

        # Backward pass
        loss.backward()
        optimizer.step()
        iteration += 1
        if iteration%50==0:
            print(f'Training accuracy ={acc:.4f}')
            print(f'epoch: {epoch+1}, step {iteration+1}, loss = {loss.item():.4f}')

Training accuracy =49.8400
epoch: 1, step 51, loss = 0.7389
Training accuracy =58.0800
epoch: 1, step 101, loss = 0.5234
Training accuracy =83.1200
epoch: 2, step 51, loss = 0.3768
Training accuracy =82.6200
epoch: 2, step 101, loss = 0.5368
Training accuracy =87.6400
epoch: 3, step 51, loss = 0.3081
Training accuracy =88.2000
epoch: 3, step 101, loss = 0.3203
Training accuracy =92.0800
epoch: 4, step 51, loss = 0.2226
Training accuracy =91.8200
epoch: 4, step 101, loss = 0.4586
Training accuracy =94.3200
epoch: 5, step 51, loss = 0.4396
Training accuracy =94.1600
epoch: 5, step 101, loss = 0.2977
Training accuracy =95.2000
epoch: 6, step 51, loss = 0.2361
Training accuracy =95.1000
epoch: 6, step 101, loss = 0.1901
Training accuracy =95.4400
epoch: 7, step 51, loss = 0.0909
Training accuracy =95.5000
epoch: 7, step 101, loss = 0.2394
Training accuracy =96.0800
epoch: 8, step 51, loss = 0.0426
Training accuracy =96.0600
epoch: 8, step 101, loss = 0.0380
Training accuracy =96.8000
epoch

In [44]:
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for batch_X, batch_Y in val_dataloader:
        # Zero the gradients
        outputs = model(batch_X)
        
        _, predictions = torch.max(outputs, 1)
        n_samples  += batch_Y.shape[0]
        n_correct += (predictions == batch_Y).sum().item()
    acc = 100* n_correct / n_samples
    print(f'accuracy ={acc:.4f}')

accuracy =85.6074


In [30]:
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predictions = torch.max(outputs, 1)
submission = pd.DataFrame({'id': test_data['id'], 'target': predictions})
submission.to_csv('submission.csv', index=False)

In [137]:
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [None]:

import pygame
import random

random.seed(20)

# Define the maze dimensions
maze_width = 10
maze_height = 10

# Define the maze start and end points
start = (0, 0)
end = (maze_width - 1, maze_height - 1)

# Define the maze walls
walls = set()
for i in range(maze_width):
    walls.add((i, -1))
    walls.add((i, maze_height))
for j in range(maze_height):
    walls.add((-1, j))
    walls.add((maze_width, j))
    
for i in range(5):
    x = random.randint(0, maze_width - 1)
    y = random.randint(0, maze_height - 1)
    walls.add((x, y))

# Generate the maze using a randomized depth-first search algorithm
stack = [start]
visited = set()
while stack:
    current_cell = stack.pop()
    if current_cell == end:
        break
    visited.add(current_cell)
    neighbors = []
    x, y = current_cell
    if (x + 1, y) not in visited and (x + 1, y) not in walls:
        neighbors.append((x + 1, y))
    if (x - 1, y) not in visited and (x - 1, y) not in walls:
        neighbors.append((x - 1, y))
    if (x, y + 1) not in visited and (x, y + 1) not in walls:
        neighbors.append((x, y + 1))
    if (x, y - 1) not in visited and (x, y - 1) not in walls:
        neighbors.append((x, y - 1))
    if neighbors:
        stack.append(current_cell)
        next_cell = random.choice(neighbors)
        walls.discard(((current_cell[0] + next_cell[0]) // 2, (current_cell[1] + next_cell[1]) // 2))
        stack.append(next_cell)

# Define the player's starting position
player_pos = start

# Initialize Pygame
pygame.init()
screen = pygame.display.set_mode((maze_width * 30, maze_height * 30))
clock = pygame.time.Clock()

# Define the game loop
while True:
    # Handle events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            quit()
        elif event.type == pygame.KEYDOWN:
            if event.key == pygame.K_UP and (player_pos[0], player_pos[1] - 1) not in walls and player_pos[1] > 0:
                player_pos = (player_pos[0], player_pos[1] - 1)
            elif event.key == pygame.K_DOWN and (player_pos[0], player_pos[1] + 1) not in walls and player_pos[1] < maze_height - 1:
                player_pos = (player_pos[0], player_pos[1] + 1)
            elif event.key == pygame.K_LEFT and (player_pos[0] - 1, player_pos[1]) not in walls and player_pos[0] > 0:
                player_pos = (player_pos[0] - 1, player_pos[1])
            elif event.key == pygame.K_RIGHT and (player_pos[0] + 1, player_pos[1]) not in walls and player_pos[0] < maze_width - 1:
                player_pos = (player_pos[0] + 1, player_pos[1])
            elif event.key == pygame.K_ESCAPE:
                pygame.quit()
                quit()

    # Draw the maze
        screen.fill((255, 255, 255))
        for j in range(maze_height):
            for i in range(maze_width):
                if (i, j) in walls:
                    pygame.draw.rect(screen, (0, 0, 0), (i * 30, j * 30, 30, 30))
        pygame.draw.rect(screen, (0, 255, 0), (start[0] * 30, start[1] * 30, 30, 30))
        pygame.draw.rect(screen, (255, 0, 0), (end[0] * 30, end[1] * 30, 30, 30))

        # Draw the player
        pygame.draw.rect(screen, (0, 0, 255), (player_pos[0] * 30, player_pos[1] * 30, 30, 30))

        # Update the display
        pygame.display.update()

        # Check for win condition
        if player_pos == end:
            print("You win!")
            pygame.quit()
            quit()

        # Tick the clock
        clock.tick(60)