In [None]:
# prompt: Can we try an RNN model that does the same job? Please give me the complete code. Could you build this model?

import re

import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from gensim import downloader as api
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

nltk.download('wordnet')
nltk.download('punkt')
!pip install openpyxl


# Load the spreadsheet of job offers.
df = pd.read_excel('Euraxess_Satcom.xlsx')

# Free-text columns to merge; only those actually present in the sheet are used.
columns = ['Title', 'OfferDescription', 'Requirements', 'Responsibilities', 'AdditionalInformation']
existing_columns = [name for name in columns if name in df.columns]


def _join_non_missing(row):
    """Join the non-null cells of one row into a single space-separated string."""
    return ' '.join(row.dropna().astype(str))


# One text per offer: every available field concatenated, missing cells skipped.
df['combined_text'] = df[existing_columns].apply(_join_non_missing, axis=1)

# Rows whose fields were all missing end up empty; give them the placeholder 'NA'.
df['combined_text'] = df['combined_text'].map(lambda text: text or 'NA')

# Lemmatization: run every token of the combined text through the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Tokenize *text*, lemmatize each token, and re-join the tokens with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))


df['lemmatized_text'] = df['combined_text'].map(_lemmatize_text)

# Keyword-based labeling
# An offer is labelled 1 when its text mentions any of these domain terms, else 0.
keywords = [
    'aerospace', 'geolocation', 'satellite', 'navigation', 'GPS', 'GNSS', 'GIS',
    'remote sensing', 'UAV', 'drone', 'positioning', 'earth observation', 'orbit',
    'launch vehicle', 'spacecraft', 'cosmonaut', 'astronaut', 'cartography',
    'geospatial', 'surveying', 'remote sensing imagery', 'geodetic', 'positioning system',  'space', 'geodesy', 'mapping', 'photogrammetry', 'lidar', 'radar', 'earth science',
    'geophysical', 'geospatial analysis', 'location based services', 'lbs', 'navigation system',
    'satellite imagery', 'aerial imagery', 'geospatial data', 'geodata', 'geomatics'
]

# Plain substring matching against lower-cased text had two defects:
#   * the upper-case keywords ('GPS', 'GNSS', 'GIS', 'UAV') could never match
#     lower-cased text -> match case-insensitively instead;
#   * short keywords would hit the middle of unrelated words ('gis' inside
#     'registration', 'space' inside 'workspace') -> anchor at a word start.
# The end of the keyword is left open so inflected forms ('satellites',
# 'UAVs', 'orbital') still count as mentions.
keyword_pattern = re.compile(
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')',
    re.IGNORECASE,
)
df['label'] = df['lemmatized_text'].apply(
    lambda text: 1 if keyword_pattern.search(text) else 0
)

# Word2Vec embeddings
# Pretrained vectors loaded through gensim's downloader API.
word2vec_model = api.load("word2vec-google-news-300")


def sentence_to_vec(sentence):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of *sentence*.

    Args:
        sentence: Text to embed; it is split with ``nltk.word_tokenize``.

    Returns:
        A 1-D array of length ``word2vec_model.vector_size``. Tokens missing
        from the model vocabulary are skipped; if none remain, a zero vector
        is returned.
    """
    vectors = [
        word2vec_model[word]
        for word in nltk.word_tokenize(sentence)
        if word in word2vec_model.key_to_index
    ]
    if not vectors:
        # Size the fallback from the loaded model rather than hard-coding 300,
        # so it always agrees with the vectors returned for other sentences.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)


# One averaged vector per offer.
df['word2vec'] = df['lemmatized_text'].apply(sentence_to_vec)

# Sequence encoding
# An Embedding layer looks up integer token ids and an LSTM needs a time axis.
# The averaged vectors in df['word2vec'] provide neither (feeding them to the
# Embedding layer turns float components into meaningless or out-of-range
# indices), so every text is encoded here as a padded sequence of token ids.
MAX_SEQUENCE_LENGTH = 300  # upper bound on tokens kept per text; the rest is truncated

tokenized_texts = [
    [token for token in nltk.word_tokenize(text) if token in word2vec_model.key_to_index]
    for text in df['lemmatized_text']
]

# Id 0 is reserved for padding; ids 1..V cover only the words that occur in
# this corpus, so the embedding matrix holds V + 1 rows instead of the whole
# pretrained vocabulary.
corpus_vocab = sorted({token for tokens in tokenized_texts for token in tokens})
token_to_id = {token: idx for idx, token in enumerate(corpus_vocab, start=1)}

embedding_dim = word2vec_model.vector_size
embedding_matrix = np.zeros((len(corpus_vocab) + 1, embedding_dim), dtype='float32')
for token, idx in token_to_id.items():
    embedding_matrix[idx] = word2vec_model[token]

longest_text = max((len(tokens) for tokens in tokenized_texts), default=0)
sequence_length = max(1, min(longest_text, MAX_SEQUENCE_LENGTH))

# Right-padded id matrix: one row per text, zeros after the last real token.
sequences = np.zeros((len(tokenized_texts), sequence_length), dtype='int32')
for row, tokens in enumerate(tokenized_texts):
    ids = [token_to_id[token] for token in tokens[:sequence_length]]
    sequences[row, :len(ids)] = ids

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    sequences, df['label'].to_numpy(), test_size=0.2, random_state=42
)

# RNN model: frozen pretrained embeddings -> LSTM -> sigmoid classifier
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_dim,
    mask_zero=True,  # let the LSTM skip the padding positions
    trainable=False,
)
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Build first so the embedding weights exist, then load the pretrained vectors.
model.build(input_shape=(None, sequence_length))
embedding_layer.set_weights([embedding_matrix])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Predictions and evaluation
# predict() returns shape (n, 1); flatten so labels and predictions are both 1-D.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 reports 0 instead of warning when a class is never predicted/present.
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Predict for the whole dataset (this includes the rows the model was trained on)
df['prediction'] = (model.predict(sequences) > 0.5).astype(int).ravel()
print(df['prediction'].value_counts())

# Inspect misclassified samples: keyword label says 1 but the model predicted 0
false_negatives = df[(df['label'] == 1) & (df['prediction'] == 0)]
print(false_negatives[['lemmatized_text', 'label', 'prediction']])
