In [69]:
import nltk
nltk.download('wordnet')
import gensim.downloader as api
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [70]:

df = pd.read_excel('SpaceIndividuals_GNSS.xlsx')

columns = ['Title', 'OfferDescription', 'Requirements', 'Responsibilities', 'AdditionalInformation']
existing_columns = [col for col in columns if col in df.columns]


input_texts = df[existing_columns].apply(lambda x: '. '.join(x.dropna().astype(str)), axis=1).tolist()
input_texts = [text if text else 'NA' for text in input_texts]

In [71]:

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmas)

# Apply lemmatization to input texts
input_texts = [lemmatize_text(text) for text in input_texts]

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
X = vectorizer.fit_transform(input_texts)

In [72]:
def determine_label(text):
    keywords = [
    'aerospace', 'geolocation', 'satellite', 'navigation', 'GPS', 'GNSS', 'GIS',
    'remote sensing', 'UAV', 'drone', 'positioning', 'earth observation', 'orbit',
    'launch vehicle', 'spacecraft', 'cosmonaut', 'astronaut', 'cartography',
    'geospatial', 'surveying', 'remote sensing imagery', 'geodetic', 'positioning system']
    return 1 if any(keyword in text.lower() for keyword in keywords) else 0


y = [determine_label(text) for text in input_texts]

In [73]:
!pip install gensim
import nltk
import numpy as np
from gensim.models import KeyedVectors
from gensim import downloader as api

word2vec_model = api.load("word2vec-google-news-300")

def sentence_to_vec(sentence):
    words = nltk.word_tokenize(sentence)
    vectors = [word2vec_model[word] for word in words if word in word2vec_model.key_to_index] # Use key_to_index instead of .vocab
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)  # Vektör boyutuna göre ayarlayın



In [74]:

#Cümleleri vektörlere dönüştürün
sentence_vectors = [sentence_to_vec(sentence) for sentence in input_texts]

# Scikit-learn'e uygun bir formata dönüştürün
X = np.array(sentence_vectors)

# Assuming 'input_texts' is your original list of text documents
X_train, X_test, y_train, y_test = train_test_split(input_texts, y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9090909090909091
Precision: 0.9090909090909091
Recall: 1.0
F1-score: 0.0


In [75]:
# prompt: cikan sonuclari dataya yeni bir column olarak kaydet `

# Tahminleri DataFrame'e ekleyin
df['cikartik'] = pipeline.predict(input_texts)




In [83]:
# prompt: df['cikartik'] daki 0  ve 1 leri goster

print(df['cikartik'].value_counts())


cikartik
1    54
Name: count, dtype: int64
